summaryrefslogtreecommitdiffstats
path: root/media/libjpeg
diff options
context:
space:
mode:
Diffstat (limited to 'media/libjpeg')
-rw-r--r--media/libjpeg/ChangeLog.md1950
-rw-r--r--media/libjpeg/LICENSE.md132
-rw-r--r--media/libjpeg/MOZCHANGES163
-rw-r--r--media/libjpeg/README.ijg258
-rw-r--r--media/libjpeg/README.md357
-rw-r--r--media/libjpeg/jaricom.c157
-rw-r--r--media/libjpeg/jcapimin.c295
-rw-r--r--media/libjpeg/jcapistd.c162
-rw-r--r--media/libjpeg/jcarith.c932
-rw-r--r--media/libjpeg/jccoefct.c449
-rw-r--r--media/libjpeg/jccolext.c144
-rw-r--r--media/libjpeg/jccolor.c721
-rw-r--r--media/libjpeg/jcdctmgr.c720
-rw-r--r--media/libjpeg/jchuff.c1136
-rw-r--r--media/libjpeg/jchuff.h50
-rw-r--r--media/libjpeg/jcicc.c105
-rw-r--r--media/libjpeg/jcinit.c80
-rw-r--r--media/libjpeg/jcmainct.c162
-rw-r--r--media/libjpeg/jcmarker.c664
-rw-r--r--media/libjpeg/jcmaster.c639
-rw-r--r--media/libjpeg/jcomapi.c109
-rw-r--r--media/libjpeg/jconfig.h37
-rw-r--r--media/libjpeg/jconfigint.h54
-rw-r--r--media/libjpeg/jcparam.c541
-rw-r--r--media/libjpeg/jcphuff.c1113
-rw-r--r--media/libjpeg/jcprepct.c351
-rw-r--r--media/libjpeg/jcsample.c522
-rw-r--r--media/libjpeg/jctrans.c401
-rw-r--r--media/libjpeg/jdapimin.c407
-rw-r--r--media/libjpeg/jdapistd.c689
-rw-r--r--media/libjpeg/jdarith.c782
-rw-r--r--media/libjpeg/jdatadst.c287
-rw-r--r--media/libjpeg/jdatasrc.c295
-rw-r--r--media/libjpeg/jdcoefct.c878
-rw-r--r--media/libjpeg/jdcoefct.h83
-rw-r--r--media/libjpeg/jdcol565.c384
-rw-r--r--media/libjpeg/jdcolext.c141
-rw-r--r--media/libjpeg/jdcolor.c881
-rw-r--r--media/libjpeg/jdct.h208
-rw-r--r--media/libjpeg/jddctmgr.c352
-rw-r--r--media/libjpeg/jdhuff.c834
-rw-r--r--media/libjpeg/jdhuff.h247
-rw-r--r--media/libjpeg/jdicc.c167
-rw-r--r--media/libjpeg/jdinput.c408
-rw-r--r--media/libjpeg/jdmainct.c460
-rw-r--r--media/libjpeg/jdmainct.h71
-rw-r--r--media/libjpeg/jdmarker.c1374
-rw-r--r--media/libjpeg/jdmaster.c726
-rw-r--r--media/libjpeg/jdmaster.h28
-rw-r--r--media/libjpeg/jdmerge.c587
-rw-r--r--media/libjpeg/jdmerge.h47
-rw-r--r--media/libjpeg/jdmrg565.c354
-rw-r--r--media/libjpeg/jdmrgext.c184
-rw-r--r--media/libjpeg/jdphuff.c679
-rw-r--r--media/libjpeg/jdpostct.c294
-rw-r--r--media/libjpeg/jdsample.c524
-rw-r--r--media/libjpeg/jdsample.h50
-rw-r--r--media/libjpeg/jdtrans.c156
-rw-r--r--media/libjpeg/jerror.c251
-rw-r--r--media/libjpeg/jerror.h331
-rw-r--r--media/libjpeg/jfdctflt.c169
-rw-r--r--media/libjpeg/jfdctfst.c227
-rw-r--r--media/libjpeg/jfdctint.c288
-rw-r--r--media/libjpeg/jidctflt.c240
-rw-r--r--media/libjpeg/jidctfst.c371
-rw-r--r--media/libjpeg/jidctint.c2627
-rw-r--r--media/libjpeg/jidctred.c409
-rw-r--r--media/libjpeg/jinclude.h145
-rw-r--r--media/libjpeg/jmemmgr.c1180
-rw-r--r--media/libjpeg/jmemnobs.c110
-rw-r--r--media/libjpeg/jmemsys.h178
-rw-r--r--media/libjpeg/jmorecfg.h373
-rw-r--r--media/libjpeg/jpeg_nbits_table.h4098
-rw-r--r--media/libjpeg/jpegcomp.h32
-rw-r--r--media/libjpeg/jpegint.h375
-rw-r--r--media/libjpeg/jpeglib.h1132
-rw-r--r--media/libjpeg/jquant1.c856
-rw-r--r--media/libjpeg/jquant2.c1285
-rw-r--r--media/libjpeg/jsimd.h123
-rw-r--r--media/libjpeg/jsimd_none.c431
-rw-r--r--media/libjpeg/jsimddct.h70
-rw-r--r--media/libjpeg/jstdhuff.c144
-rw-r--r--media/libjpeg/jutils.c133
-rw-r--r--media/libjpeg/jversion.h54
-rw-r--r--media/libjpeg/moz.build323
-rw-r--r--media/libjpeg/mozilla.diff59
-rw-r--r--media/libjpeg/simd/arm/aarch32/jccolext-neon.c148
-rw-r--r--media/libjpeg/simd/arm/aarch32/jchuff-neon.c334
-rw-r--r--media/libjpeg/simd/arm/aarch32/jsimd.c976
-rw-r--r--media/libjpeg/simd/arm/aarch32/jsimd_neon.S1200
-rw-r--r--media/libjpeg/simd/arm/aarch64/jccolext-neon.c316
-rw-r--r--media/libjpeg/simd/arm/aarch64/jchuff-neon.c411
-rw-r--r--media/libjpeg/simd/arm/aarch64/jsimd.c1053
-rw-r--r--media/libjpeg/simd/arm/aarch64/jsimd_neon.S2254
-rw-r--r--media/libjpeg/simd/arm/align.h28
-rw-r--r--media/libjpeg/simd/arm/jccolor-neon.c160
-rw-r--r--media/libjpeg/simd/arm/jcgray-neon.c120
-rw-r--r--media/libjpeg/simd/arm/jcgryext-neon.c106
-rw-r--r--media/libjpeg/simd/arm/jchuff.h131
-rw-r--r--media/libjpeg/simd/arm/jcphuff-neon.c623
-rw-r--r--media/libjpeg/simd/arm/jcsample-neon.c192
-rw-r--r--media/libjpeg/simd/arm/jdcolext-neon.c374
-rw-r--r--media/libjpeg/simd/arm/jdcolor-neon.c141
-rw-r--r--media/libjpeg/simd/arm/jdmerge-neon.c144
-rw-r--r--media/libjpeg/simd/arm/jdmrgext-neon.c723
-rw-r--r--media/libjpeg/simd/arm/jdsample-neon.c569
-rw-r--r--media/libjpeg/simd/arm/jfdctfst-neon.c214
-rw-r--r--media/libjpeg/simd/arm/jfdctint-neon.c376
-rw-r--r--media/libjpeg/simd/arm/jidctfst-neon.c472
-rw-r--r--media/libjpeg/simd/arm/jidctint-neon.c801
-rw-r--r--media/libjpeg/simd/arm/jidctred-neon.c486
-rw-r--r--media/libjpeg/simd/arm/jquanti-neon.c193
-rw-r--r--media/libjpeg/simd/arm/neon-compat.h33
-rw-r--r--media/libjpeg/simd/i386/jccolext-avx2.asm578
-rw-r--r--media/libjpeg/simd/i386/jccolext-mmx.asm476
-rw-r--r--media/libjpeg/simd/i386/jccolext-sse2.asm503
-rw-r--r--media/libjpeg/simd/i386/jccolor-avx2.asm121
-rw-r--r--media/libjpeg/simd/i386/jccolor-mmx.asm121
-rw-r--r--media/libjpeg/simd/i386/jccolor-sse2.asm120
-rw-r--r--media/libjpeg/simd/i386/jcgray-avx2.asm113
-rw-r--r--media/libjpeg/simd/i386/jcgray-mmx.asm113
-rw-r--r--media/libjpeg/simd/i386/jcgray-sse2.asm112
-rw-r--r--media/libjpeg/simd/i386/jcgryext-avx2.asm457
-rw-r--r--media/libjpeg/simd/i386/jcgryext-mmx.asm355
-rw-r--r--media/libjpeg/simd/i386/jcgryext-sse2.asm382
-rw-r--r--media/libjpeg/simd/i386/jchuff-sse2.asm761
-rw-r--r--media/libjpeg/simd/i386/jcphuff-sse2.asm662
-rw-r--r--media/libjpeg/simd/i386/jcsample-avx2.asm388
-rw-r--r--media/libjpeg/simd/i386/jcsample-mmx.asm324
-rw-r--r--media/libjpeg/simd/i386/jcsample-sse2.asm351
-rw-r--r--media/libjpeg/simd/i386/jdcolext-avx2.asm515
-rw-r--r--media/libjpeg/simd/i386/jdcolext-mmx.asm404
-rw-r--r--media/libjpeg/simd/i386/jdcolext-sse2.asm458
-rw-r--r--media/libjpeg/simd/i386/jdcolor-avx2.asm118
-rw-r--r--media/libjpeg/simd/i386/jdcolor-mmx.asm117
-rw-r--r--media/libjpeg/simd/i386/jdcolor-sse2.asm117
-rw-r--r--media/libjpeg/simd/i386/jdmerge-avx2.asm136
-rw-r--r--media/libjpeg/simd/i386/jdmerge-mmx.asm123
-rw-r--r--media/libjpeg/simd/i386/jdmerge-sse2.asm135
-rw-r--r--media/libjpeg/simd/i386/jdmrgext-avx2.asm575
-rw-r--r--media/libjpeg/simd/i386/jdmrgext-mmx.asm460
-rw-r--r--media/libjpeg/simd/i386/jdmrgext-sse2.asm517
-rw-r--r--media/libjpeg/simd/i386/jdsample-avx2.asm760
-rw-r--r--media/libjpeg/simd/i386/jdsample-mmx.asm731
-rw-r--r--media/libjpeg/simd/i386/jdsample-sse2.asm724
-rw-r--r--media/libjpeg/simd/i386/jfdctflt-3dn.asm318
-rw-r--r--media/libjpeg/simd/i386/jfdctflt-sse.asm369
-rw-r--r--media/libjpeg/simd/i386/jfdctfst-mmx.asm395
-rw-r--r--media/libjpeg/simd/i386/jfdctfst-sse2.asm403
-rw-r--r--media/libjpeg/simd/i386/jfdctint-avx2.asm331
-rw-r--r--media/libjpeg/simd/i386/jfdctint-mmx.asm620
-rw-r--r--media/libjpeg/simd/i386/jfdctint-sse2.asm633
-rw-r--r--media/libjpeg/simd/i386/jidctflt-3dn.asm451
-rw-r--r--media/libjpeg/simd/i386/jidctflt-sse.asm571
-rw-r--r--media/libjpeg/simd/i386/jidctflt-sse2.asm497
-rw-r--r--media/libjpeg/simd/i386/jidctfst-mmx.asm499
-rw-r--r--media/libjpeg/simd/i386/jidctfst-sse2.asm501
-rw-r--r--media/libjpeg/simd/i386/jidctint-avx2.asm453
-rw-r--r--media/libjpeg/simd/i386/jidctint-mmx.asm851
-rw-r--r--media/libjpeg/simd/i386/jidctint-sse2.asm858
-rw-r--r--media/libjpeg/simd/i386/jidctred-mmx.asm704
-rw-r--r--media/libjpeg/simd/i386/jidctred-sse2.asm592
-rw-r--r--media/libjpeg/simd/i386/jquant-3dn.asm230
-rw-r--r--media/libjpeg/simd/i386/jquant-mmx.asm276
-rw-r--r--media/libjpeg/simd/i386/jquant-sse.asm208
-rw-r--r--media/libjpeg/simd/i386/jquantf-sse2.asm168
-rw-r--r--media/libjpeg/simd/i386/jquanti-avx2.asm188
-rw-r--r--media/libjpeg/simd/i386/jquanti-sse2.asm201
-rw-r--r--media/libjpeg/simd/i386/jsimd.c1312
-rw-r--r--media/libjpeg/simd/i386/jsimdcpu.asm135
-rw-r--r--media/libjpeg/simd/jsimd.h1258
-rw-r--r--media/libjpeg/simd/mips/jsimd.c1143
-rw-r--r--media/libjpeg/simd/mips/jsimd_dspr2.S4543
-rw-r--r--media/libjpeg/simd/mips/jsimd_dspr2_asm.h292
-rw-r--r--media/libjpeg/simd/mips64/jccolext-mmi.c455
-rw-r--r--media/libjpeg/simd/mips64/jccolor-mmi.c148
-rw-r--r--media/libjpeg/simd/mips64/jcgray-mmi.c132
-rw-r--r--media/libjpeg/simd/mips64/jcgryext-mmi.c374
-rw-r--r--media/libjpeg/simd/mips64/jcsample-mmi.c98
-rw-r--r--media/libjpeg/simd/mips64/jcsample.h28
-rw-r--r--media/libjpeg/simd/mips64/jdcolext-mmi.c415
-rw-r--r--media/libjpeg/simd/mips64/jdcolor-mmi.c139
-rw-r--r--media/libjpeg/simd/mips64/jdmerge-mmi.c149
-rw-r--r--media/libjpeg/simd/mips64/jdmrgext-mmi.c615
-rw-r--r--media/libjpeg/simd/mips64/jdsample-mmi.c304
-rw-r--r--media/libjpeg/simd/mips64/jfdctfst-mmi.c255
-rw-r--r--media/libjpeg/simd/mips64/jfdctint-mmi.c398
-rw-r--r--media/libjpeg/simd/mips64/jidctfst-mmi.c395
-rw-r--r--media/libjpeg/simd/mips64/jidctint-mmi.c571
-rw-r--r--media/libjpeg/simd/mips64/jquanti-mmi.c124
-rw-r--r--media/libjpeg/simd/mips64/jsimd.c866
-rw-r--r--media/libjpeg/simd/mips64/jsimd_mmi.h69
-rw-r--r--media/libjpeg/simd/mips64/loongson-mmintrin.h1334
-rw-r--r--media/libjpeg/simd/nasm/jcolsamp.inc135
-rw-r--r--media/libjpeg/simd/nasm/jdct.inc31
-rw-r--r--media/libjpeg/simd/nasm/jsimdcfg.inc93
-rw-r--r--media/libjpeg/simd/nasm/jsimdcfg.inc.h133
-rw-r--r--media/libjpeg/simd/nasm/jsimdext.inc520
-rw-r--r--media/libjpeg/simd/powerpc/jccolext-altivec.c269
-rw-r--r--media/libjpeg/simd/powerpc/jccolor-altivec.c116
-rw-r--r--media/libjpeg/simd/powerpc/jcgray-altivec.c111
-rw-r--r--media/libjpeg/simd/powerpc/jcgryext-altivec.c228
-rw-r--r--media/libjpeg/simd/powerpc/jcsample-altivec.c159
-rw-r--r--media/libjpeg/simd/powerpc/jcsample.h28
-rw-r--r--media/libjpeg/simd/powerpc/jdcolext-altivec.c276
-rw-r--r--media/libjpeg/simd/powerpc/jdcolor-altivec.c106
-rw-r--r--media/libjpeg/simd/powerpc/jdmerge-altivec.c130
-rw-r--r--media/libjpeg/simd/powerpc/jdmrgext-altivec.c329
-rw-r--r--media/libjpeg/simd/powerpc/jdsample-altivec.c400
-rw-r--r--media/libjpeg/simd/powerpc/jfdctfst-altivec.c154
-rw-r--r--media/libjpeg/simd/powerpc/jfdctint-altivec.c258
-rw-r--r--media/libjpeg/simd/powerpc/jidctfst-altivec.c255
-rw-r--r--media/libjpeg/simd/powerpc/jidctint-altivec.c357
-rw-r--r--media/libjpeg/simd/powerpc/jquanti-altivec.c250
-rw-r--r--media/libjpeg/simd/powerpc/jsimd.c884
-rw-r--r--media/libjpeg/simd/powerpc/jsimd_altivec.h98
-rw-r--r--media/libjpeg/simd/x86_64/jccolext-avx2.asm559
-rw-r--r--media/libjpeg/simd/x86_64/jccolext-sse2.asm484
-rw-r--r--media/libjpeg/simd/x86_64/jccolor-avx2.asm121
-rw-r--r--media/libjpeg/simd/x86_64/jccolor-sse2.asm120
-rw-r--r--media/libjpeg/simd/x86_64/jcgray-avx2.asm113
-rw-r--r--media/libjpeg/simd/x86_64/jcgray-sse2.asm112
-rw-r--r--media/libjpeg/simd/x86_64/jcgryext-avx2.asm438
-rw-r--r--media/libjpeg/simd/x86_64/jcgryext-sse2.asm363
-rw-r--r--media/libjpeg/simd/x86_64/jchuff-sse2.asm583
-rw-r--r--media/libjpeg/simd/x86_64/jcphuff-sse2.asm639
-rw-r--r--media/libjpeg/simd/x86_64/jcsample-avx2.asm367
-rw-r--r--media/libjpeg/simd/x86_64/jcsample-sse2.asm330
-rw-r--r--media/libjpeg/simd/x86_64/jdcolext-avx2.asm496
-rw-r--r--media/libjpeg/simd/x86_64/jdcolext-sse2.asm439
-rw-r--r--media/libjpeg/simd/x86_64/jdcolor-avx2.asm118
-rw-r--r--media/libjpeg/simd/x86_64/jdcolor-sse2.asm117
-rw-r--r--media/libjpeg/simd/x86_64/jdmerge-avx2.asm136
-rw-r--r--media/libjpeg/simd/x86_64/jdmerge-sse2.asm135
-rw-r--r--media/libjpeg/simd/x86_64/jdmrgext-avx2.asm596
-rw-r--r--media/libjpeg/simd/x86_64/jdmrgext-sse2.asm538
-rw-r--r--media/libjpeg/simd/x86_64/jdsample-avx2.asm696
-rw-r--r--media/libjpeg/simd/x86_64/jdsample-sse2.asm665
-rw-r--r--media/libjpeg/simd/x86_64/jfdctflt-sse.asm355
-rw-r--r--media/libjpeg/simd/x86_64/jfdctfst-sse2.asm389
-rw-r--r--media/libjpeg/simd/x86_64/jfdctint-avx2.asm320
-rw-r--r--media/libjpeg/simd/x86_64/jfdctint-sse2.asm619
-rw-r--r--media/libjpeg/simd/x86_64/jidctflt-sse2.asm482
-rw-r--r--media/libjpeg/simd/x86_64/jidctfst-sse2.asm491
-rw-r--r--media/libjpeg/simd/x86_64/jidctint-avx2.asm418
-rw-r--r--media/libjpeg/simd/x86_64/jidctint-sse2.asm847
-rw-r--r--media/libjpeg/simd/x86_64/jidctred-sse2.asm574
-rw-r--r--media/libjpeg/simd/x86_64/jquantf-sse2.asm155
-rw-r--r--media/libjpeg/simd/x86_64/jquanti-avx2.asm163
-rw-r--r--media/libjpeg/simd/x86_64/jquanti-sse2.asm188
-rw-r--r--media/libjpeg/simd/x86_64/jsimd.c1110
-rw-r--r--media/libjpeg/simd/x86_64/jsimdcpu.asm86
252 files changed, 110951 insertions, 0 deletions
diff --git a/media/libjpeg/ChangeLog.md b/media/libjpeg/ChangeLog.md
new file mode 100644
index 0000000000..1c1e6538a4
--- /dev/null
+++ b/media/libjpeg/ChangeLog.md
@@ -0,0 +1,1950 @@
+2.1.5.1
+=======
+
+### Significant changes relative to 2.1.5:
+
+1. The SIMD dispatchers in libjpeg-turbo 2.1.4 and prior stored the list of
+supported SIMD instruction sets in a global variable, which caused an innocuous
+race condition whereby the variable could have been initialized multiple times
+if `jpeg_start_*compress()` was called simultaneously in multiple threads.
+libjpeg-turbo 2.1.5 included an undocumented attempt to fix this race condition
+by making the SIMD support variable thread-local. However, that caused another
+issue whereby, if `jpeg_start_*compress()` was called in one thread and
+`jpeg_read_*()` or `jpeg_write_*()` was called in a second thread, the SIMD
+support variable was never initialized in the second thread. On x86 systems,
+this led the second thread to incorrectly assume that AVX2 instructions were
+always available, and when it attempted to use those instructions on older x86
+CPUs that do not support them, an illegal instruction error occurred. The SIMD
+dispatchers now ensure that the SIMD support variable is initialized before
+dispatching based on its value.
+
+
+2.1.5
+=====
+
+### Significant changes relative to 2.1.4:
+
+1. Fixed issues in the build system whereby, when using the Ninja Multi-Config
+CMake generator, a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed, a Windows installer could not
+be built, and the Java regression tests failed.
+
+2. Fixed a regression introduced by 2.0 beta1[15] that caused a buffer overrun
+in the progressive Huffman encoder when attempting to transform a
+specially-crafted malformed 12-bit-per-component JPEG image into a progressive
+12-bit-per-component JPEG image using a 12-bit-per-component build of
+libjpeg-turbo (`-DWITH_12BIT=1`.) Given that the buffer overrun was fully
+contained within the progressive Huffman encoder structure and did not cause a
+segfault or other user-visible errant behavior, given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, and given that 12-bit-per-component builds of libjpeg-turbo are
+uncommon, this issue did not likely pose a security risk.
+
+3. Fixed an issue whereby, when using a 12-bit-per-component build of
+libjpeg-turbo (`-DWITH_12BIT=1`), passing samples with values greater than 4095
+or less than 0 to `jpeg_write_scanlines()` caused a buffer overrun or underrun
+in the RGB-to-YCbCr color converter.
+
+4. Fixed a floating point exception that occurred when attempting to use the
+jpegtran `-drop` and `-trim` options to losslessly transform a
+specially-crafted malformed JPEG image.
+
+5. Fixed an issue in `tjBufSizeYUV2()` whereby it returned a bogus result,
+rather than throwing an error, if the `align` parameter was not a power of 2.
+Fixed a similar issue in `tjCompressFromYUV()` whereby it generated a corrupt
+JPEG image in certain cases, rather than throwing an error, if the `align`
+parameter was not a power of 2.
+
+6. Fixed an issue whereby `tjDecompressToYUV2()`, which is a wrapper for
+`tjDecompressToYUVPlanes()`, used the desired YUV image dimensions rather than
+the actual scaled image dimensions when computing the plane pointers and
+strides to pass to `tjDecompressToYUVPlanes()`. This caused a buffer overrun
+and subsequent segfault if the desired image dimensions exceeded the scaled
+image dimensions.
+
+7. Fixed an issue whereby, when decompressing a 12-bit-per-component JPEG image
+(`-DWITH_12BIT=1`) using an alpha-enabled output color space such as
+`JCS_EXT_RGBA`, the alpha channel was set to 255 rather than 4095.
+
+8. Fixed an issue whereby the Java version of TJBench did not accept a range of
+quality values.
+
+9. Fixed an issue whereby, when `-progressive` was passed to TJBench, the JPEG
+input image was not transformed into a progressive JPEG image prior to
+decompression.
+
+
+2.1.4
+=====
+
+### Significant changes relative to 2.1.3:
+
+1. Fixed a regression introduced in 2.1.3 that caused build failures with
+Visual Studio 2010.
+
+2. The `tjDecompressHeader3()` function in the TurboJPEG C API and the
+`TJDecompressor.setSourceImage()` method in the TurboJPEG Java API now accept
+"abbreviated table specification" (AKA "tables-only") datastreams, which can be
+used to prime the decompressor with quantization and Huffman tables that can be
+used when decompressing subsequent "abbreviated image" datastreams.
+
+3. libjpeg-turbo now performs run-time detection of AltiVec instructions on
+OS X/PowerPC systems if AltiVec instructions are not enabled at compile time.
+This allows both AltiVec-equipped (PowerPC G4 and G5) and non-AltiVec-equipped
+(PowerPC G3) CPUs to be supported using the same build of libjpeg-turbo.
+
+4. Fixed an error ("Bogus virtual array access") that occurred when attempting
+to decompress a progressive JPEG image with a height less than or equal to one
+iMCU (8 * the vertical sampling factor) using buffered-image mode with
+interblock smoothing enabled. This was a regression introduced by
+2.1 beta1[6(b)].
+
+5. Fixed two issues that prevented partial image decompression from working
+properly with buffered-image mode:
+
+ - Attempting to call `jpeg_crop_scanline()` after
+`jpeg_start_decompress()` but before `jpeg_start_output()` resulted in an error
+("Improper call to JPEG library in state 207".)
+ - Attempting to use `jpeg_skip_scanlines()` resulted in an error ("Bogus
+virtual array access") under certain circumstances.
+
+
+2.1.3
+=====
+
+### Significant changes relative to 2.1.2:
+
+1. Fixed a regression introduced by 2.0 beta1[7] whereby cjpeg compressed PGM
+input files into full-color JPEG images unless the `-grayscale` option was
+used.
+
+2. cjpeg now automatically compresses GIF and 8-bit BMP input files into
+grayscale JPEG images if the input files contain only shades of gray.
+
+3. The build system now enables the intrinsics implementation of the AArch64
+(Arm 64-bit) Neon SIMD extensions by default when using GCC 12 or later.
+
+4. Fixed a segfault that occurred while decompressing a 4:2:0 JPEG image using
+the merged (non-fancy) upsampling algorithms (that is, with
+`cinfo.do_fancy_upsampling` set to `FALSE`) along with `jpeg_crop_scanline()`.
+Specifically, the segfault occurred if the number of bytes remaining in the
+output buffer was less than the number of bytes required to represent one
+uncropped scanline of the output image. For that reason, the issue could only
+be reproduced using the libjpeg API, not using djpeg.
+
+
+2.1.2
+=====
+
+### Significant changes relative to 2.1.1:
+
+1. Fixed a regression introduced by 2.1 beta1[13] that caused the remaining
+GAS implementations of AArch64 (Arm 64-bit) Neon SIMD functions (which are used
+by default with GCC for performance reasons) to be placed in the `.rodata`
+section rather than in the `.text` section. This caused the GNU linker to
+automatically place the `.rodata` section in an executable segment, which
+prevented libjpeg-turbo from working properly with other linkers and also
+represented a potential security risk.
+
+2. Fixed an issue whereby the `tjTransform()` function incorrectly computed the
+MCU block size for 4:4:4 JPEG images with non-unary sampling factors and thus
+unduly rejected some cropping regions, even though those regions aligned with
+8x8 MCU block boundaries.
+
+3. Fixed a regression introduced by 2.1 beta1[13] that caused the build system
+to enable the Arm Neon SIMD extensions when targeting Armv6 and other legacy
+architectures that do not support Neon instructions.
+
+4. libjpeg-turbo now performs run-time detection of AltiVec instructions on
+FreeBSD/PowerPC systems if AltiVec instructions are not enabled at compile
+time. This allows both AltiVec-equipped and non-AltiVec-equipped CPUs to be
+supported using the same build of libjpeg-turbo.
+
+5. cjpeg now accepts a `-strict` argument similar to that of djpeg and
+jpegtran, which causes the compressor to abort if an LZW-compressed GIF input
+image contains incomplete or corrupt image data.
+
+
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0:
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+4. Added a command-line argument (`-copy icc`) to jpegtran that causes it to
+copy only the ICC profile markers from the source file and discard any other
+metadata.
+
+5. libjpeg-turbo should now build and run on CHERI-enabled architectures, which
+use capability pointers that are larger than the size of `size_t`.
+
+6. Fixed a regression (CVE-2021-37972) introduced by 2.1 beta1[5] that caused a
+segfault in the 64-bit SSE2 Huffman encoder when attempting to losslessly
+transform a specially-crafted malformed JPEG image.
+
+
+2.1.0
+=====
+
+### Significant changes relative to 2.1 beta1:
+
+1. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress certain progressive JPEG images with one or more component planes of
+width 8 or less caused a buffer overrun.
+
+2. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress a specially-crafted malformed progressive JPEG image caused the
+block smoothing algorithm to read from uninitialized memory.
+
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
+4. Fixed a floating point exception (CVE-2021-20205) that occurred when
+attempting to compress a specially-crafted malformed GIF image with a specified
+image width of 0 using cjpeg.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 32 and non-zero
+successive approximation low bit positions would, under certain circumstances,
+result in an error ("Missing Huffman code table entry") and an invalid JPEG
+image.
+
+6. Introduced a new flag (`TJFLAG_LIMITSCANS` in the TurboJPEG C API and
+`TJ.FLAG_LIMIT_SCANS` in the TurboJPEG Java API) and a corresponding TJBench
+command-line argument (`-limitscans`) that causes the TurboJPEG decompression
+and transform functions/operations to return/throw an error if a progressive
+JPEG image contains an unreasonably large number of scans. This allows
+applications that use the TurboJPEG API to guard against an exploit of the
+progressive JPEG format described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+7. The PPM reader now throws an error, rather than segfaulting (due to a buffer
+overrun, CVE-2021-46822) or generating incorrect pixels, if an application
+attempts to use the `tjLoadImage()` function to load a 16-bit binary PPM file
+(a binary PPM file with a maximum value greater than 255) into a grayscale
+image buffer or to load a 16-bit binary PGM file into an RGB image buffer.
+
+8. Fixed an issue in the PPM reader that caused incorrect pixels to be
+generated when using the `tjLoadImage()` function to load a 16-bit binary PPM
+file into an extended RGB image buffer.
+
+9. Fixed an issue whereby, if a JPEG buffer was automatically re-allocated by
+one of the TurboJPEG compression or transform functions and an error
+subsequently occurred during compression or transformation, the JPEG buffer
+pointer passed by the application was not updated when the function returned.
+
+
+2.0.90 (2.1 beta1)
+==================
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers. The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+ Caveats:
+ - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+ - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms. Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+ - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+ - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+ - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+ - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+ - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+ The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds. 32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds. 32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. The block smoothing algorithm that is applied by default when decompressing
+progressive Huffman-encoded JPEG images has been improved in the following
+ways:
+
+ - The algorithm is now more fault-tolerant. Previously, if a particular
+scan was incomplete, then the smoothing parameters for the incomplete scan
+would be applied to the entire output image, including the parts of the image
+that were generated by the prior (complete) scan. Visually, this had the
+effect of removing block smoothing from lower-frequency scans if they were
+followed by an incomplete higher-frequency scan. libjpeg-turbo now applies
+block smoothing parameters to each iMCU row based on which scan generated the
+pixels in that row, rather than always using the block smoothing parameters for
+the most recent scan.
+ - When applying block smoothing to DC scans, a Gaussian-like kernel with a
+5x5 window is used to reduce the "blocky" appearance.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+ - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+ - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries. This facilitates using libjpeg-turbo with CMake's
+`find_package()` function. For example:
+
+    find_package(libjpeg-turbo CONFIG REQUIRED)
+
+    add_executable(libjpeg_program libjpeg_program.c)
+    target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+    add_executable(libjpeg_program_static libjpeg_program.c)
+    target_link_libraries(libjpeg_program_static PUBLIC
+      libjpeg-turbo::jpeg-static)
+
+    add_executable(turbojpeg_program turbojpeg_program.c)
+    target_link_libraries(turbojpeg_program PUBLIC
+      libjpeg-turbo::turbojpeg)
+
+    add_executable(turbojpeg_program_static turbojpeg_program.c)
+    target_link_libraries(turbojpeg_program_static PUBLIC
+      libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option. Refer to jpegtran.1 or usage.txt for more details.
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms. This new implementation is
+significantly faster in some cases than the old GAS implementation--
+depending on the algorithms used, the type of CPU core, and the compiler. GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+ - 32-bit RGB-to-YCbCr color conversion
+ - 32-bit fast and accurate inverse DCT
+ - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+ - 64-bit accurate forward and inverse DCT
+ - 64-bit Huffman encoding
+
+ A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
+
+ Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
+libjpeg-turbo SDK package for both iOS and macOS.
+
+
+2.0.6
+=====
+
+### Significant changes relative to 2.0.5:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when using any of the YUV encoding/compression/decompression/decoding
+methods in the TurboJPEG Java API.
+
+2. Fixed or worked around multiple issues with `jpeg_skip_scanlines()`:
+
+ - Fixed segfaults (CVE-2020-35538) or "Corrupt JPEG data: premature end of
+data segment" errors in `jpeg_skip_scanlines()` that occurred when
+decompressing 4:2:2 or 4:2:0 JPEG images using merged (non-fancy)
+upsampling/color conversion (that is, when setting `cinfo.do_fancy_upsampling`
+to `FALSE`.) 2.0.0[6] was a similar fix, but it did not cover all cases.
+ - `jpeg_skip_scanlines()` now throws an error if two-pass color
+quantization is enabled. Two-pass color quantization never worked properly
+with `jpeg_skip_scanlines()`, and the issues could not readily be fixed.
+ - Fixed an issue whereby `jpeg_skip_scanlines()` always returned 0 when
+skipping past the end of an image.
+
+3. The Arm 64-bit (Armv8) Neon SIMD extensions can now be built using MinGW
+toolchains targeting Arm64 (AArch64) Windows binaries.
+
+4. Fixed unexpected visual artifacts that occurred when using
+`jpeg_crop_scanline()` and interblock smoothing while decompressing only the DC
+scan of a progressive JPEG image.
+
+5. Fixed an issue whereby libjpeg-turbo would not build if 12-bit-per-component
+JPEG support (`WITH_12BIT`) was enabled along with libjpeg v7 or libjpeg v8
+API/ABI emulation (`WITH_JPEG7` or `WITH_JPEG8`.)
+
+
+2.0.5
+=====
+
+### Significant changes relative to 2.0.4:
+
+1. Worked around issues in the MIPS DSPr2 SIMD extensions that caused failures
+in the libjpeg-turbo regression tests. Specifically, the
+`jsimd_h2v1_downsample_dspr2()` and `jsimd_h2v2_downsample_dspr2()` functions
+in the MIPS DSPr2 SIMD extensions are now disabled until/unless they can be
+fixed, and other functions that are incompatible with big endian MIPS CPUs are
+disabled when building libjpeg-turbo for such CPUs.
+
+2. Fixed an oversight in the `TJCompressor.compress(int)` method in the
+TurboJPEG Java API that caused an error ("java.lang.IllegalStateException: No
+source image is associated with this instance") when attempting to use that
+method to compress a YUV image.
+
+3. Fixed an issue (CVE-2020-13790) in the PPM reader that caused a buffer
+overrun in cjpeg, TJBench, or the `tjLoadImage()` function if one of the values
+in a binary PPM/PGM input file exceeded the maximum value defined in the file's
+header and that maximum value was less than 255. libjpeg-turbo 1.5.0 already
+included a similar fix for binary PPM/PGM files with maximum values greater
+than 255.
+
+4. The TurboJPEG API library's global error handler, which is used in functions
+such as `tjBufSize()` and `tjLoadImage()` that do not require a TurboJPEG
+instance handle, is now thread-safe on platforms that support thread-local
+storage.
+
+
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images. This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
+
+6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down. This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS. This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors. This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
+2.0.1
+=====
+
+### Significant changes relative to 2.0.0:
+
+1. Fixed a regression introduced with the new CMake-based Un*x build system,
+whereby jconfig.h could cause compiler warnings of the form
+`"HAVE_*_H" redefined` if it was included by downstream Autotools-based
+projects that used `AC_CHECK_HEADERS()` to check for the existence of locale.h,
+stddef.h, or stdlib.h.
+
+2. The `jsimd_quantize_float_dspr2()` and `jsimd_convsamp_float_dspr2()`
+functions in the MIPS DSPr2 SIMD extensions are now disabled at compile time
+if the soft float ABI is enabled. Those functions use instructions that are
+incompatible with the soft float ABI.
+
+3. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that caused libjpeg-turbo to crash on
+Windows 7 if Service Pack 1 was not installed.
+
+4. Fixed an out-of-bounds read in cjpeg that occurred when attempting to
+compress a specially-crafted malformed color-index (8-bit-per-sample) Targa
+file in which some of the samples (color indices) exceeded the bounds of the
+Targa file's color table.
+
+5. Fixed an issue whereby installing a fully static build of libjpeg-turbo
+(a build in which `CFLAGS` contains `-static` and `ENABLE_SHARED` is `0`) would
+fail with "No valid ELF RPATH or RUNPATH entry exists in the file."
+
+
+2.0.0
+=====
+
+### Significant changes relative to 2.0 beta1:
+
+1. The TurboJPEG API can now decompress CMYK JPEG images that have subsampled M
+and Y components (not to be confused with YCCK JPEG images, in which the C/M/Y
+components have been transformed into luma and chroma.) Previously, an error
+was generated ("Could not determine subsampling type for JPEG image") when such
+an image was passed to `tjDecompressHeader3()`, `tjTransform()`,
+`tjDecompressToYUVPlanes()`, `tjDecompressToYUV2()`, or the equivalent Java
+methods.
+
+2. Fixed an issue (CVE-2018-11813) whereby a specially-crafted malformed input
+file (specifically, a file with a valid Targa header but incomplete pixel data)
+would cause cjpeg to generate a JPEG file that was potentially thousands of
+times larger than the input file. The Targa reader in cjpeg was not properly
+detecting that the end of the input file had been reached prematurely, so after
+all valid pixels had been read from the input, the reader injected dummy pixels
+with values of 255 into the JPEG compressor until the number of pixels
+specified in the Targa header had been compressed. The Targa reader in cjpeg
+now behaves like the PPM reader and aborts compression if the end of the input
+file is reached prematurely. Because this issue only affected cjpeg and not
+the underlying library, and because it did not involve any out-of-bounds reads
+or other exploitable behaviors, it was not believed to represent a security
+threat.
+
+3. Fixed an issue whereby the `tjLoadImage()` and `tjSaveImage()` functions
+would produce a "Bogus message code" error message if the underlying bitmap and
+PPM readers/writers threw an error that was specific to the readers/writers
+(as opposed to a general libjpeg API error.)
+
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
+
+5. Fixed an issue whereby certain combinations of calls to
+`jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
+loop when decompressing progressive JPEG images that use vertical chroma
+subsampling (for instance, 4:2:0 or 4:4:0.)
+
+6. Fixed a segfault in `jpeg_skip_scanlines()` that occurred when decompressing
+a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
+(that is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.)
+
+7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
+extensions if it detects that the compiler does not support DSPr2 instructions.
+
+8. Fixed an out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
+
+9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
+by the Clang and GCC undefined behavior sanitizers, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+did not pose a security threat, but removing the warning made it easier to
+detect actual security issues, should they arise in the future.
+
+
+1.5.90 (2.0 beta1)
+==================
+
+### Significant changes relative to 1.5.3:
+
+1. Added AVX2 SIMD implementations of the colorspace conversion, chroma
+downsampling and upsampling, integer quantization and sample conversion, and
+accurate integer DCT/IDCT algorithms. When using the accurate integer DCT/IDCT
+algorithms on AVX2-equipped CPUs, the compression of RGB images is
+approximately 13-36% (avg. 22%) faster (relative to libjpeg-turbo 1.5.x) with
+64-bit code and 11-21% (avg. 17%) faster with 32-bit code, and the
+decompression of RGB images is approximately 9-35% (avg. 17%) faster with
+64-bit code and 7-17% (avg. 12%) faster with 32-bit code. (As tested on a
+3 GHz Intel Core i7. Actual mileage may vary.)
+
+2. Overhauled the build system to use CMake on all platforms, and removed the
+autotools-based build system. This decision resulted from extensive
+discussions within the libjpeg-turbo community. libjpeg-turbo traditionally
+used CMake only for Windows builds, but there was an increasing amount of
+demand to extend CMake support to other platforms. However, because of the
+unique nature of our code base (the need to support different assemblers on
+each platform, the need for Java support, etc.), providing dual build systems
+as other OSS imaging libraries do (including libpng and libtiff) would have
+created a maintenance burden. The use of CMake greatly simplifies some aspects
+of our build system, owing to CMake's built-in support for various assemblers,
+Java, and unit testing, as well as generally fewer quirks that have to be
+worked around in order to implement our packaging system. Eliminating
+autotools puts our project slightly at odds with the traditional practices of
+the OSS community, since most "system libraries" tend to be built with
+autotools, but it is believed that the benefits of this move outweigh the
+risks. In addition to providing a unified build environment, switching to
+CMake allows for the use of various build tools and IDEs that aren't supported
+under autotools, including Xcode, Ninja, and Eclipse. It also eliminates the
+need to install autotools via MacPorts/Homebrew on OS X and allows
+libjpeg-turbo to be configured without the use of a terminal/command prompt.
+Extensive testing was conducted to ensure that all features provided by the
+autotools-based build system are provided by the new build system.
+
+3. The libjpeg API in this version of libjpeg-turbo now includes two additional
+functions, `jpeg_read_icc_profile()` and `jpeg_write_icc_profile()`, that can
+be used to extract ICC profile data from a JPEG file while decompressing or to
+embed ICC profile data in a JPEG file while compressing or transforming. This
+eliminates the need for downstream projects, such as color management libraries
+and browsers, to include their own glueware for accomplishing this.
+
+4. Improved error handling in the TurboJPEG API library:
+
+ - Introduced a new function (`tjGetErrorStr2()`) in the TurboJPEG C API
+that allows compression/decompression/transform error messages to be retrieved
+in a thread-safe manner. Retrieving error messages from global functions, such
+as `tjInitCompress()` or `tjBufSize()`, is still thread-unsafe, but since those
+functions will only throw errors if passed an invalid argument or if a memory
+allocation failure occurs, thread safety is not as much of a concern.
+ - Introduced a new function (`tjGetErrorCode()`) in the TurboJPEG C API
+and a new method (`TJException.getErrorCode()`) in the TurboJPEG Java API that
+can be used to determine the severity of the last
+compression/decompression/transform error. This allows applications to
+choose whether to ignore warnings (non-fatal errors) from the underlying
+libjpeg API or to treat them as fatal.
+ - Introduced a new flag (`TJFLAG_STOPONWARNING` in the TurboJPEG C API and
+`TJ.FLAG_STOPONWARNING` in the TurboJPEG Java API) that causes the library to
+immediately halt a compression/decompression/transform operation if it
+encounters a warning from the underlying libjpeg API (the default behavior is
+to allow the operation to complete unless a fatal error is encountered.)
+
+5. Introduced a new flag in the TurboJPEG C and Java APIs (`TJFLAG_PROGRESSIVE`
+and `TJ.FLAG_PROGRESSIVE`, respectively) that causes the library to use
+progressive entropy coding in JPEG images generated by compression and
+transform operations. Additionally, a new transform option
+(`TJXOPT_PROGRESSIVE` in the C API and `TJTransform.OPT_PROGRESSIVE` in the
+Java API) has been introduced, allowing progressive entropy coding to be
+enabled for selected transforms in a multi-transform operation.
+
+6. Introduced a new transform option in the TurboJPEG API (`TJXOPT_COPYNONE` in
+the C API and `TJTransform.OPT_COPYNONE` in the Java API) that allows the
+copying of markers (including EXIF and ICC profile data) to be disabled for a
+particular transform.
+
+7. Added two functions to the TurboJPEG C API (`tjLoadImage()` and
+`tjSaveImage()`) that can be used to load/save a BMP or PPM/PGM image to/from a
+memory buffer with a specified pixel format and layout. These functions
+replace the project-private (and slow) bmp API, which was previously used by
+TJBench, and they also provide a convenient way for first-time users of
+libjpeg-turbo to quickly develop a complete JPEG compression/decompression
+program.
+
+8. The TurboJPEG C API now includes a new convenience array (`tjAlphaOffset[]`)
+that contains the alpha component index for each pixel format (or -1 if the
+pixel format lacks an alpha component.) The TurboJPEG Java API now includes a
+new method (`TJ.getAlphaOffset()`) that returns the same value. In addition,
+the `tjRedOffset[]`, `tjGreenOffset[]`, and `tjBlueOffset[]` arrays-- and the
+corresponding `TJ.getRedOffset()`, `TJ.getGreenOffset()`, and
+`TJ.getBlueOffset()` methods-- now return -1 for `TJPF_GRAY`/`TJ.PF_GRAY`
+rather than 0. This allows programs to easily determine whether a pixel format
+has red, green, blue, and alpha components.
+
+9. Added a new example (tjexample.c) that demonstrates the basic usage of the
+TurboJPEG C API. This example mirrors the functionality of TJExample.java.
+Both files are now included in the libjpeg-turbo documentation.
+
+10. Fixed two signed integer overflows in the arithmetic decoder, detected by
+the Clang undefined behavior sanitizer, that could be triggered by attempting
+to decompress a specially-crafted malformed JPEG image. These issues did not
+pose a security threat, but removing the warnings makes it easier to detect
+actual security issues, should they arise in the future.
+
+11. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
+algorithm that caused incorrect dithering in the output image. This algorithm
+now produces bitwise-identical results to the unmerged algorithms.
+
+12. The SIMD function symbols for x86[-64]/ELF, MIPS/ELF, macOS/x86[-64] (if
+libjpeg-turbo is built with Yasm), and iOS/Arm[64] builds are now private.
+This prevents those symbols from being exposed in applications or shared
+libraries that link statically with libjpeg-turbo.
+
+13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
+YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
+chroma upsampling, integer quantization, and accurate integer DCT/IDCT
+algorithms. When using the accurate integer DCT/IDCT, this speeds up the
+compression of RGB images by approximately 70-100% and the decompression of RGB
+images by approximately 2-3.5x.
+
+14. Fixed a build error when building with older MinGW releases (regression
+caused by 1.5.1[7].)
+
+15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
+x86 and x86-64 platforms. This speeds up the compression of full-color
+progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
+when using modern Intel and AMD CPUs.
+
+
+1.5.3
+=====
+
+### Significant changes relative to 1.5.2:
+
+1. Fixed a NullPointerException in the TurboJPEG Java wrapper that occurred
+when using the YUVImage constructor that creates an instance backed by separate
+image planes and allocates memory for the image planes.
+
+2. Fixed an issue whereby the Java version of TJUnitTest would fail when
+testing BufferedImage encoding/decoding on big endian systems.
+
+3. Fixed a segfault in djpeg that would occur if an output format other than
+PPM/PGM was selected along with the `-crop` option. The `-crop` option now
+works with the GIF and Targa formats as well (unfortunately, it cannot be made
+to work with the BMP and RLE formats due to the fact that those output engines
+write scanlines in bottom-up order.) djpeg will now exit gracefully if an
+output format other than PPM/PGM, GIF, or Targa is selected along with the
+`-crop` option.
+
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
+
+5. TJBench (both C and Java versions) will now display usage information if any
+command-line argument is unrecognized. This prevents the program from silently
+ignoring typos.
+
+6. Fixed an access violation in tjbench.exe (Windows) that occurred when the
+program was used to decompress an existing JPEG image.
+
+7. Fixed an ArrayIndexOutOfBoundsException in the TJExample Java program that
+occurred when attempting to decompress a JPEG image that had been compressed
+with 4:1:1 chrominance subsampling.
+
+8. Fixed an issue whereby, when using `jpeg_skip_scanlines()` to skip to the
+end of a single-scan (non-progressive) image, subsequent calls to
+`jpeg_consume_input()` would return `JPEG_SUSPENDED` rather than
+`JPEG_REACHED_EOI`.
+
+9. `jpeg_crop_scanline()` now works correctly when decompressing grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).
+
+
+1.5.2
+=====
+
+### Significant changes relative to 1.5.1:
+
+1. Fixed a regression introduced by 1.5.1[7] that prevented libjpeg-turbo from
+building with Android NDK platforms prior to android-21 (5.0).
+
+2. Fixed a regression introduced by 1.5.1[1] that prevented the MIPS DSPr2 SIMD
+code in libjpeg-turbo from building.
+
+3. Fixed a regression introduced by 1.5 beta1[11] that prevented the Java
+version of TJBench from outputting any reference images (the `-nowrite` switch
+was accidentally enabled by default.)
+
+4. libjpeg-turbo should now build and run with full AltiVec SIMD acceleration
+on PowerPC-based AmigaOS 4 and OpenBSD systems.
+
+5. Fixed build and runtime errors on Windows that occurred when building
+libjpeg-turbo with libjpeg v7 API/ABI emulation and the in-memory
+source/destination managers. Due to an oversight, the `jpeg_skip_scanlines()`
+and `jpeg_crop_scanline()` functions were not being included in jpeg7.dll when
+libjpeg-turbo was built with `-DWITH_JPEG7=1` and `-DWITH_MEMSRCDST=1`.
+
+6. Fixed a "Bogus virtual array access" error that occurred when using the
+lossless crop feature in jpegtran or the TurboJPEG API, if libjpeg-turbo was
+built with libjpeg v7 API/ABI emulation. This was apparently a long-standing
+bug that has existed since the introduction of libjpeg v7/v8 API/ABI emulation
+in libjpeg-turbo v1.1.
+
+7. The lossless transform features in jpegtran and the TurboJPEG API will now
+always attempt to adjust the EXIF image width and height tags if the image size
+changed as a result of the transform. This behavior has always existed when
+using libjpeg v8 API/ABI emulation. It was supposed to be available with
+libjpeg v7 API/ABI emulation as well but did not work properly due to a bug.
+Furthermore, there was never any good reason not to enable it with libjpeg v6b
+API/ABI emulation, since the behavior is entirely internal. Note that
+`-copy all` must be passed to jpegtran in order to transfer the EXIF tags from
+the source image to the destination image.
+
+8. Fixed several memory leaks in the TurboJPEG API library that could occur
+if the library was built with certain compilers and optimization levels
+(known to occur with GCC 4.x and Clang with `-O1` and higher but not with
+GCC 5.x or 6.x) and one of the underlying libjpeg API functions threw an error
+after a TurboJPEG API function allocated a local buffer.
+
+9. The libjpeg-turbo memory manager will now honor the `max_memory_to_use`
+structure member in jpeg\_memory\_mgr, which can be set to the maximum amount
+of memory (in bytes) that libjpeg-turbo should use during decompression or
+multi-pass (including progressive) compression. This limit can also be set
+using the `JPEGMEM` environment variable or using the `-maxmemory` switch in
+cjpeg/djpeg/jpegtran (refer to the respective man pages for more details.)
+This has been a documented feature of libjpeg since v5, but the
+`malloc()`/`free()` implementation of the memory manager (jmemnobs.c) never
+implemented the feature. Restricting libjpeg-turbo's memory usage is useful
+for two reasons: it allows testers to more easily work around the 2 GB limit
+in libFuzzer, and it allows developers of security-sensitive applications to
+more easily defend against one of the progressive JPEG exploits (LJT-01-004)
+identified in
+[this report](http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+10. TJBench will now run each benchmark for 1 second prior to starting the
+timer, in order to improve the consistency of the results. Furthermore, the
+`-warmup` option is now used to specify the amount of warmup time rather than
+the number of warmup iterations.
+
+11. Fixed an error (`short jump is out of range`) that occurred when assembling
+the 32-bit x86 SIMD extensions with NASM versions prior to 2.04. This was a
+regression introduced by 1.5 beta1[12].
+
+
+1.5.1
+=====
+
+### Significant changes relative to 1.5.0:
+
+1. Previously, the undocumented `JSIMD_FORCE*` environment variables could be
+used to force-enable a particular SIMD instruction set if multiple instruction
+sets were available on a particular platform. On x86 platforms, where CPU
+feature detection is bulletproof and multiple SIMD instruction sets are
+available, it makes sense for those environment variables to allow forcing the
+use of an instruction set only if that instruction set is available. However,
+since the ARM implementations of libjpeg-turbo can only use one SIMD
+instruction set, and since their feature detection code is less bulletproof
+(parsing /proc/cpuinfo), it makes sense for the `JSIMD_FORCENEON` environment
+variable to bypass the feature detection code and really force the use of NEON
+instructions. A new environment variable (`JSIMD_FORCEDSPR2`) was introduced
+in the MIPS implementation for the same reasons, and the existing
+`JSIMD_FORCENONE` environment variable was extended to that implementation.
+These environment variables provide a workaround for those attempting to test
+ARM and MIPS builds of libjpeg-turbo in QEMU, which passes through
+/proc/cpuinfo from the host system.
+
+2. libjpeg-turbo previously assumed that AltiVec instructions were always
+available on PowerPC platforms, which led to "illegal instruction" errors when
+running on PowerPC chips that lack AltiVec support (such as the older 7xx/G3
+and newer e5500 series.) libjpeg-turbo now examines /proc/cpuinfo on
+Linux/Android systems and enables AltiVec instructions only if the CPU supports
+them. It also now provides two environment variables, `JSIMD_FORCEALTIVEC` and
+`JSIMD_FORCENONE`, to force-enable and force-disable AltiVec instructions in
+environments where /proc/cpuinfo is an unreliable means of CPU feature
+detection (such as when running in QEMU.) On OS X, libjpeg-turbo continues to
+assume that AltiVec support is always available, which means that libjpeg-turbo
+cannot be used with G3 Macs unless you set the environment variable
+`JSIMD_FORCENONE` to `1`.
+
+3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
+crash when built with recent releases of the Clang/LLVM compiler. This was
+caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
+routines. Those routines were incorrectly using 64-bit instructions to
+transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fancy upsampling is now supported when decompressing JPEG images that use
+4:4:0 (h1v2) chroma subsampling. These images are generated when losslessly
+rotating or transposing JPEG images that use 4:2:2 (h2v1) chroma subsampling.
+The h1v2 fancy upsampling algorithm is not currently SIMD-accelerated.
+
+5. If merged upsampling isn't SIMD-accelerated but YCbCr-to-RGB conversion is,
+then libjpeg-turbo will now disable merged upsampling when decompressing YCbCr
+JPEG images into RGB or extended RGB output images. This significantly speeds
+up the decompression of 4:2:0 and 4:2:2 JPEGs on ARM platforms if fancy
+upsampling is not used (for example, if the `-nosmooth` option to djpeg is
+specified.)
+
+6. The TurboJPEG API will now decompress 4:2:2 and 4:4:0 JPEG images with
+2x2 luminance sampling factors and 2x1 or 1x2 chrominance sampling factors.
+This is a non-standard way of specifying 2x subsampling (normally 4:2:2 JPEGs
+have 2x1 luminance and 1x1 chrominance sampling factors, and 4:4:0 JPEGs have
+1x2 luminance and 1x1 chrominance sampling factors), but the JPEG format and
+the libjpeg API both allow it.
+
+7. Fixed an unsigned integer overflow in the libjpeg memory manager, detected
+by the Clang undefined behavior sanitizer, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+affected only 32-bit code and did not pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+8. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers when attempting to decompress
+specially-crafted malformed JPEG images. None of these issues posed a security
+threat, but removing the warnings makes it easier to detect actual security
+issues, should they arise in the future.
+
+9. Fixed an out-of-bounds array reference, introduced by 1.4.90[2] (partial
+image decompression) and detected by the Clang undefined behavior sanitizer,
+that could be triggered by a specially-crafted malformed JPEG image with more
+than four components. Because the out-of-bounds reference was still within the
+same structure, it was not known to pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code. Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
+1.5.0
+=====
+
+### Significant changes relative to 1.5 beta1:
+
+1. Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
+path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
+
+2. Added libjpeg-turbo version and build information to the global string table
+of the libjpeg and TurboJPEG API libraries. This is a common practice in other
+infrastructure libraries, such as OpenSSL and libpng, because it makes it easy
+to examine an application binary and determine which version of the library the
+application was linked against.
+
+3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
+in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
+maximum value defined in the file's header and that maximum value was greater
+than 255. libjpeg-turbo 1.4.2 already included a similar fix for ASCII PPM/PGM
+files. Note that these issues were not security bugs, since they were confined
+to the cjpeg program and did not affect any of the libjpeg-turbo libraries.
+
+4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
+header using the `tjDecompressToYUV2()` function would cause the function to
+abort without returning an error and, under certain circumstances, corrupt the
+stack. This only occurred if `tjDecompressToYUV2()` was called prior to
+calling `tjDecompressHeader3()`, or if the return value from
+`tjDecompressHeader3()` was ignored (both cases represent incorrect usage of
+the TurboJPEG API.)
+
+5. Fixed an issue in the ARM 32-bit SIMD-accelerated Huffman encoder that
+prevented the code from assembling properly with clang.
+
+6. The `jpeg_stdio_src()`, `jpeg_mem_src()`, `jpeg_stdio_dest()`, and
+`jpeg_mem_dest()` functions in the libjpeg API will now throw an error if a
+source/destination manager has already been assigned to the compress or
+decompress object by a different function or by the calling program. This
+prevents these functions from attempting to reuse a source/destination manager
+structure that was allocated elsewhere, because there is no way to ensure that
+it would be big enough to accommodate the new source/destination manager.
+
+
+1.4.90 (1.5 beta1)
+==================
+
+### Significant changes relative to 1.4.2:
+
+1. Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions. Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+2. Added two new libjpeg API functions (`jpeg_skip_scanlines()` and
+`jpeg_crop_scanline()`) that can be used to partially decode a JPEG image. See
+[libjpeg.txt](libjpeg.txt) for more details.
+
+3. The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+4. The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+5. Source buffers for the TurboJPEG C API functions, as well as the
+`jpeg_mem_src()` function in the libjpeg API, are now declared as const
+pointers. This facilitates passing read-only buffers to those functions and
+assures the caller that the source buffer will not be modified. This should
+not create any backward API or ABI incompatibilities with prior libjpeg-turbo
+releases.
+
+6. The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+7. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers. Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+8. Removed the unnecessary `.arch` directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+9. Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions. This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time. Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+10. The x86-64 SIMD code can now be disabled at run time by setting the
+`JSIMD_FORCENONE` environment variable to `1` (the other SIMD implementations
+already had this capability.)
+
+11. Added a new command-line argument to TJBench (`-nowrite`) that prevents the
+benchmark from outputting any images. This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+12. Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and x86-64
+platforms. This speeds up the compression of full-color JPEGs by about 10-15%
+on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and AMD
+CPUs. Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(<https://llvm.org/bugs/show_bug.cgi?id=16035>). For the purposes of
+benchmarking or regression testing, SIMD-accelerated Huffman encoding can be
+disabled by setting the `JSIMD_NOHUFFENC` environment variable to `1`.
+
+13. Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the accurate integer forward DCT and h2v2 &
+h2v1 downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.) This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+14. Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+ For 32-bit code, this speeds up the compression of full-color JPEGs by
+about 30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by
+about 6-7% on average on a typical Android device (Nexus 5X, Cortex-A53 and
+Cortex-A57), relative to libjpeg-turbo 1.4.x. Note that the larger speedup
+under iOS is due to the fact that iOS builds use LLVM, which does not optimize
+the C Huffman encoder as well as GCC does.
+
+ For 64-bit code, NEON-accelerated Huffman encoding speeds up the
+compression of full-color JPEGs by about 40% on average on a typical iOS device
+(iPhone 5S, Apple A7) and by about 7-8% on average on a typical Android device
+(Nexus 5X, Cortex-A53 and Cortex-A57), in addition to the speedup described in
+[13] above.
+
+ For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the `JSIMD_NOHUFFENC` environment
+variable to `1`.
+
+15. pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems. Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+16. Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines. This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+17. Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed. In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer. However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+18. Fixed a memory leak in tjunittest encountered when running the program
+with the `-yuv` option.
+
+
+1.4.2
+=====
+
+### Significant changes relative to 1.4.1:
+
+1. Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+2. Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used. This was known to cause `make test` to fail when the
+library was built with `-march=haswell` on x86 systems.
+
+3. Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler. This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines. Those routines were incorrectly using a 64-bit `mov` instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels. The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+5. Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers. None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+### Significant changes relative to 1.4.0:
+
+1. tjbench now properly handles CMYK/YCCK JPEG files. Passing an argument of
+`-cmyk` (instead of, for instance, `-rgb`) will cause tjbench to internally
+convert the source bitmap to CMYK prior to compression, to generate YCCK JPEG
+files, and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.) The CMYK<->RGB conversion operation is
+not benchmarked. NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only. Proper conversion between CMYK and RGB
+requires a color management system.
+
+2. `make test` now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+3. `make test` no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in [Makefile.am](Makefile.am) for information on how to
+re-enable the tests and to specify an expected result for them based on the
+particulars of your platform.
+
+4. The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+5. Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the `-nosmooth` option
+on a MIPS machine that lacked DSPr2 support. The MIPS SIMD routines for h2v1
+and h2v2 merged upsampling were not properly checking for the existence of
+DSPr2.
+
+6. Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.) Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux. Oops.
+
+7. Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the accurate integer forward DCT
+were used.
+
+8. The Windows (CMake) build system now supports building only static or only
+shared libraries. This is accomplished by adding either `-DENABLE_STATIC=0` or
+`-DENABLE_SHARED=0` to the CMake command line.
+
+9. TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API. For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+10. Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+### Significant changes relative to 1.4 beta1:
+
+1. Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the `le32toh()` and `htole32()` functions.)
+
+2. The non-SIMD RGB565 color conversion code did not work correctly on
+big-endian machines. This has been fixed.
+
+3. Fixed an issue in `tjPlaneSizeYUV()` whereby it would erroneously return 1
+instead of -1 if `componentID` was > 0 and `subsamp` was `TJSAMP_GRAY`.
+
+4. Fixed an issue in `tjBufSizeYUV2()` whereby it would erroneously return 0
+instead of -1 if `width` was < 1.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM64 platforms (see 1.4 beta1[5].)
+
+6. The `close()` method in the TJCompressor and TJDecompressor Java classes is
+now idempotent. Previously, that method would call the native `tjDestroy()`
+function even if the TurboJPEG instance had already been destroyed. This
+caused an exception to be thrown during finalization, if the `close()` method
+had already been called. The exception was caught, but it was still an
+expensive operation.
+
+7. The TurboJPEG API previously generated an error (`Could not determine
+subsampling type for JPEG image`) when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`). Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor. However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+8. cjpeg, djpeg, and jpegtran now accept an argument of `-version`, which will
+print the library version and exit.
+
+9. Referring to 1.4 beta1[15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun. Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks. Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+10. The new `tjPlaneSizeYUV()`, `tjPlaneWidth()`, and `tjPlaneHeight()`
+functions were not actually usable on any platform except OS X and Windows,
+because those functions were not included in the libturbojpeg mapfile. This
+has been fixed.
+
+11. Restored the `JPP()`, `JMETHOD()`, and `FAR` macros in the libjpeg-turbo
+header files. The `JPP()` and `JMETHOD()` macros were originally implemented
+in libjpeg as a way of supporting non-ANSI compilers that lacked support for
+prototype parameters. libjpeg-turbo has never supported such compilers, but
+some software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the `FAR` macro. A
+pretty good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+12. Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+### Significant changes relative to 1.3.1:
+
+1. New features in the TurboJPEG API:
+
+ - YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+ - The decompress-to-YUV function has been extended to support image
+scaling.
+ - JPEG images can now be compressed from YUV planar source images.
+ - YUV planar images can now be decoded into RGB or grayscale images.
+ - 4:1:1 subsampling is now supported. This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+ - CMYK images are now supported. This feature allows CMYK source images
+to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to
+CMYK destination images. Conversion between CMYK/YCCK and RGB or YUV images is
+not supported. Such conversion requires a color management system and is thus
+out of scope for a codec library.
+ - The handling of YUV images in the Java API has been significantly
+refactored and should now be much more intuitive.
+ - The Java API now supports encoding a YUV image from an arbitrary
+position in a large image buffer.
+ - All of the YUV functions now have a corresponding function that operates
+on separate image planes instead of a unified image buffer. This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image. It also allows for handling YUV formats that swap the order of the
+U and V planes.
+
+2. Added SIMD acceleration for DSPr2-capable MIPS platforms. This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+3. If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables. In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application. An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+4. The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.) This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later. OS X 10.4 "Tiger" is no longer supported.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM platforms rather than a lookup table. This reduces the memory footprint
+by 64k, which may be important for some mobile applications. Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature. Actual mileage may vary.
+
+6. Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built. This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+7. Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation. Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature. They generally do not produce significantly better
+accuracy than the accurate integer DCT/IDCT algorithms, and they are quite a
+bit slower.
+
+8. Added a new output colorspace (`JCS_RGB565`) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels. If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
+
+9. Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+10. Fixed a segfault that occurred when calling `output_message()` with
+`msg_code` set to `JMSG_COPYRIGHT`.
+
+11. Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+12. Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+13. Restored 12-bit-per-component JPEG support. A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of `--with-12bit` to
+configure (Unix) or `-DWITH_12BIT=1` to cmake (Windows.) 12-bit JPEG support
+is included only for convenience. Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API. The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+14. Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.) For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager. This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
+
+16. Fixed an oversight in the TurboJPEG C wrapper: if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the `jpegSize` parameter was equal to the size of the buffer, when
+in fact that parameter was probably equal to the size of the most recently
+compressed JPEG image. If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
+1.3.1
+=====
+
+### Significant changes relative to 1.3.0:
+
+1. On Un*x systems, `make install` now installs the libjpeg-turbo libraries
+into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
+and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
+x86-64. You can override this by overriding either the `prefix` or `libdir`
+configure variables.
+
+2. The Windows installer now places a copy of the TurboJPEG DLLs in the same
+directory as the rest of the libjpeg-turbo binaries. This was mainly done
+to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
+When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
+access the c:\WINDOWS\system32 directory, which made it impossible for the
+TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
+
+3. Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
+entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
+jpegtran, for instance) would result in an error, `Requested feature was
+omitted at compile time`.
+
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
+
+5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
+when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
+source image, and added a unit test to check for this error.
+
+6. The Java classes should now build properly under Visual Studio 2010 and
+later.
+
+7. Fixed an issue that prevented SRPMs generated using the in-tree packaging
+tools from being rebuilt on certain newer Linux distributions.
+
+8. Numerous minor fixes to eliminate compilation and build/packaging system
+warnings, fix cosmetic issues, improve documentation clarity, and other general
+source cleanup.
+
+
+1.3.0
+=====
+
+### Significant changes relative to 1.3 beta1:
+
+1. `make test` now works properly on FreeBSD, and it no longer requires the
+md5sum executable to be present on other Un*x platforms.
+
+2. Overhauled the packaging system:
+
+ - To avoid conflict with vendor-supplied libjpeg-turbo packages, the
+official RPMs and DEBs for libjpeg-turbo have been renamed to
+"libjpeg-turbo-official".
+ - The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
+official Linux and Mac packages, to avoid conflict with vendor-supplied
+packages and also to streamline the packaging system.
+ - Release packages are now created with the directory structure defined
+by the configure variables `prefix`, `bindir`, `libdir`, etc. (Un\*x) or by the
+`CMAKE_INSTALL_PREFIX` variable (Windows.) The exception is that the docs are
+always located under the system default documentation directory on Un\*x and
+Mac systems, and on Windows, the TurboJPEG DLL is always located in the Windows
+system directory.
+ - To avoid confusion, official libjpeg-turbo packages on Linux/Unix
+platforms (except for Mac) will always install the 32-bit libraries in
+/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
+ - Fixed an issue whereby, in some cases, the libjpeg-turbo executables on
+Un*x systems were not properly linking with the shared libraries installed by
+the same package.
+ - Fixed an issue whereby building the "installer" target on Windows when
+`WITH_JAVA=1` would fail if the TurboJPEG JAR had not been previously built.
+ - Building the "install" target on Windows now installs files into the
+same places that the installer does.
+
+3. Fixed a Huffman encoder bug that prevented I/O suspension from working
+properly.
+
+
+1.2.90 (1.3 beta1)
+==================
+
+### Significant changes relative to 1.2.1:
+
+1. Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
+11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing. Note that the IDCT will
+not be SIMD-accelerated when using any of these new scaling factors.
+
+2. The TurboJPEG dynamic library is now versioned. It was not strictly
+necessary to do so, because TurboJPEG uses versioned symbols, and if a function
+changes in an ABI-incompatible way, that function is renamed and a legacy
+function is provided to maintain backward compatibility. However, certain
+Linux distro maintainers have a policy against accepting any library that isn't
+versioned.
+
+3. Extended the TurboJPEG Java API so that it can be used to compress a JPEG
+image from and decompress a JPEG image to an arbitrary position in a large
+image buffer.
+
+4. The `tjDecompressToYUV()` function now supports the `TJFLAG_FASTDCT` flag.
+
+5. The 32-bit supplementary package for amd64 Debian systems now provides
+symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
+This allows those libraries to be used on MultiArch-compatible systems (such as
+Ubuntu 11 and later) without setting the linker path.
+
+6. The TurboJPEG Java wrapper should now find the JNI library on Mac systems
+without having to pass `-Djava.library.path=/usr/lib` to java.
+
+7. TJBench has been ported to Java to provide a convenient way of validating
+the performance of the TurboJPEG Java API. It can be run with
+`java -cp turbojpeg.jar TJBench`.
+
+8. cjpeg can now be used to generate JPEG files with the RGB colorspace
+(feature ported from jpeg-8d.)
+
+9. The width and height in the `-crop` argument passed to jpegtran can now be
+suffixed with `f` to indicate that, when the upper left corner of the cropping
+region is automatically moved to the nearest iMCU boundary, the bottom right
+corner should be moved by the same amount. In other words, this feature causes
+jpegtran to strictly honor the specified width/height rather than the specified
+bottom right corner (feature ported from jpeg-8d.)
+
+10. JPEG files using the RGB colorspace can now be decompressed into grayscale
+images (feature ported from jpeg-8d.)
+
+11. Fixed a regression caused by 1.2.1[7] whereby the build would fail with
+multiple "Mismatch in operand sizes" errors when attempting to build the x86
+SIMD code with NASM 0.98.
+
+12. The in-memory source/destination managers (`jpeg_mem_src()` and
+`jpeg_mem_dest()`) are now included by default when building libjpeg-turbo with
+libjpeg v6b or v7 emulation, so that programs can take advantage of these
+functions without requiring the use of the backward-incompatible libjpeg v8
+ABI. The "age number" of the libjpeg-turbo library on Un*x systems has been
+incremented by 1 to reflect this. You can disable this feature with a
+configure/CMake switch in order to retain strict API/ABI compatibility with the
+libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
+[README.md](README.md) for more details.
+
+13. Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
+libjpeg-turbo binary package for OS X, so that those libraries can be used to
+build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
+
+
+1.2.1
+=====
+
+### Significant changes relative to 1.2.0:
+
+1. Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+2. When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF. This has been fixed. TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+3. Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
+
+5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors. The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular. Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+6. Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+7. Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if each row in the output image was not evenly
+divisible by 16 bytes.
+
+8. Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+9. Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
+
+
+1.2.0
+=====
+
+### Significant changes relative to 1.2 beta1:
+
+1. Fixed build issue with Yasm on Unix systems (the libjpeg-turbo build system
+was not adding the current directory to the assembler include path, so Yasm
+was not able to find jsimdcfg.inc.)
+
+2. Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind. See <http://crbug.com/72399> for more information.
+
+3. Added a compile-time macro (`LIBJPEG_TURBO_VERSION`) that can be used to
+check the version of libjpeg-turbo against which an application was compiled.
+
+4. Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
+and pixel formats (TurboJPEG API), which allow applications to specify that,
+when decompressing to a 4-component RGB buffer, the unused byte should be set
+to 0xFF so that it can be interpreted as an opaque alpha channel.
+
+5. Fixed regression issue whereby DevIL failed to build against libjpeg-turbo
+because libjpeg-turbo's distributed version of jconfig.h contained an `INLINE`
+macro, which conflicted with a similar macro in DevIL. This macro is used only
+internally when building libjpeg-turbo, so it was moved into config.h.
+
+6. libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
+K component is assigned a component ID of 1 instead of 4. Although these files
+are in violation of the spec, other JPEG implementations handle them
+correctly.
+
+7. Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
+the official libjpeg-turbo binary package for OS X, so that those libraries can
+be used to build both OS X and iOS applications.
+
+
+1.1.90 (1.2 beta1)
+==================
+
+### Significant changes relative to 1.1.1:
+
+1. Added a Java wrapper for the TurboJPEG API. See [java/README](java/README)
+for more details.
+
+2. The TurboJPEG API can now be used to scale down images during
+decompression.
+
+3. Added SIMD routines for RGB-to-grayscale color conversion, which
+significantly improves the performance of grayscale JPEG compression from an
+RGB source image.
+
+4. Improved the performance of the C color conversion routines, which are used
+on platforms for which SIMD acceleration is not available.
+
+5. Added a function to the TurboJPEG API that performs lossless transforms.
+This function is implemented using the same back end as jpegtran, but it
+performs transcoding entirely in memory and allows multiple transforms and/or
+crop operations to be batched together, so the source coefficients only need to
+be read once. This is useful when generating image tiles from a single source
+JPEG.
+
+6. Added tests for the new TurboJPEG scaled decompression and lossless
+transform features to tjbench (the TurboJPEG benchmark, formerly called
+"jpgtest".)
+
+7. Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
+was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
+transposed or rotated 90 degrees.
+
+8. All legacy VirtualGL code has been re-factored, and this has allowed
+libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
+
+9. libjpeg-turbo can now be built with Yasm.
+
+10. Added SIMD acceleration for ARM Linux and iOS platforms that support
+NEON instructions.
+
+11. Refactored the TurboJPEG C API and documented it using Doxygen. The
+TurboJPEG 1.2 API uses pixel formats to define the size and component order of
+the uncompressed source/destination images, and it includes a more efficient
+version of `TJBUFSIZE()` that computes a worst-case JPEG size based on the
+level of chrominance subsampling. The refactored implementation of the
+TurboJPEG API now uses the libjpeg memory source and destination managers,
+which allows the TurboJPEG compressor to grow the JPEG buffer as necessary.
+
+12. Eliminated errors in the output of jpegtran on Windows that occurred when
+the application was invoked using I/O redirection
+(`jpegtran <input.jpg >output.jpg`.)
+
+13. The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
+support in libjpeg-turbo v1.1.0 introduced several new error constants in
+jerror.h, and these were mistakenly enabled for all emulation modes, causing
+the error enum in libjpeg-turbo to sometimes have different values than the
+same enum in libjpeg. This represents an ABI incompatibility, and it caused
+problems with rare applications that took specific action based on a particular
+error value. The fix was to include the new error constants conditionally
+based on whether libjpeg v7 or v8 emulation was enabled.
+
+14. Fixed an issue whereby Windows applications that used libjpeg-turbo would
+fail to compile if the Windows system headers were included before jpeglib.h.
+This issue was caused by a conflict in the definition of the INT32 type.
+
+15. Fixed 32-bit supplementary package for amd64 Debian systems, which was
+broken by enhancements to the packaging system in 1.1.
+
+16. When decompressing a JPEG image using an output colorspace of
+`JCS_EXT_RGBX`, `JCS_EXT_BGRX`, `JCS_EXT_XBGR`, or `JCS_EXT_XRGB`,
+libjpeg-turbo will now set the unused byte to 0xFF, which allows applications
+to interpret that byte as an alpha channel (0xFF = opaque).
+
+
+1.1.1
+=====
+
+### Significant changes relative to 1.1.0:
+
+1. Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
+by `tjEncodeYUV()`.
+
+2. libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
+markers found in the middle of the JPEG data stream during decompression. It
+will now hand off decoding of a particular block to the unaccelerated Huffman
+decoder if an unexpected marker is found, so that the unaccelerated Huffman
+decoder can generate an appropriate warning.
+
+3. Older versions of MinGW64 prefixed symbol names with underscores by
+default, which differed from the behavior of 64-bit Visual C++. MinGW64 1.0
+has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
+this, the libjpeg-turbo SIMD function names are no longer prefixed with an
+underscore when building with MinGW64. This means that, when building
+libjpeg-turbo with older versions of MinGW64, you will now have to add
+`-fno-leading-underscore` to the `CFLAGS`.
+
+4. Fixed a regression bug in the NSIS script that caused the Windows installer
+build to fail when using the Visual Studio IDE.
+
+5. Fixed a bug in `jpeg_read_coefficients()` whereby it would not initialize
+`cinfo->image_width` and `cinfo->image_height` if libjpeg v7 or v8 emulation
+was enabled. This specifically caused the jpegoptim program to fail if it was
+linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
+emulation.
+
+6. Eliminated excessive I/O overhead that occurred when reading BMP files in
+cjpeg.
+
+7. Eliminated errors in the output of cjpeg on Windows that occurred when the
+application was invoked using I/O redirection (`cjpeg <inputfile >output.jpg`.)
+
+
+1.1.0
+=====
+
+### Significant changes relative to 1.1 beta1:
+
+1. The algorithm used by the SIMD quantization function cannot produce correct
+results when the JPEG quality is >= 98 and the fast integer forward DCT is
+used. Thus, the non-SIMD quantization function is now used for those cases,
+and libjpeg-turbo should now produce identical output to libjpeg v6b in all
+cases.
+
+2. Despite the above, the fast integer forward DCT still degrades somewhat for
+JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
+use the accurate integer forward DCT when generating JPEG images of quality 96
+or greater. This reduces compression performance by as much as 15% for these
+high-quality images but is necessary to ensure that the images are perceptually
+lossless. It also ensures that the library can avoid the performance pitfall
+created by [1].
+
+3. Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
+
+4. Fixed visual artifacts in grayscale JPEG compression caused by a typo in
+the RGB-to-luminance lookup tables.
+
+5. The Windows distribution packages now include the libjpeg run-time programs
+(cjpeg, etc.)
+
+6. All packages now include jpgtest.
+
+7. The TurboJPEG dynamic library now uses versioned symbols.
+
+8. Added two new TurboJPEG API functions, `tjEncodeYUV()` and
+`tjDecompressToYUV()`, to replace the somewhat hackish `TJ_YUV` flag.
+
+
+1.0.90 (1.1 beta1)
+==================
+
+### Significant changes relative to 1.0.1:
+
+1. Added emulation of the libjpeg v7 and v8 APIs and ABIs. See
+[README.md](README.md) for more details. This feature was sponsored by
+CamTrace SAS.
+
+2. Created a new CMake-based build system for the Visual C++ and MinGW builds.
+
+3. The TurboJPEG API can now be used to compress from and decompress to
+grayscale bitmaps.
+
+4. jpgtest can now be used to test decompression performance with existing
+JPEG images.
+
+5. If the default install prefix (/opt/libjpeg-turbo) is used, then
+`make install` now creates /opt/libjpeg-turbo/lib32 and
+/opt/libjpeg-turbo/lib64 sym links to duplicate the behavior of the binary
+packages.
+
+6. All symbols in the libjpeg-turbo dynamic library are now versioned, even
+when the library is built with libjpeg v6b emulation.
+
+7. Added arithmetic encoding and decoding support (can be disabled with
+configure or CMake options.)
+
+8. Added a `TJ_YUV` flag to the TurboJPEG API, which causes both the compressor
+and decompressor to output planar YUV images.
+
+9. Added an extended version of `tjDecompressHeader()` to the TurboJPEG API,
+which allows the caller to determine the type of subsampling used in a JPEG
+image.
+
+10. Added further protections against invalid Huffman codes.
+
+
+1.0.1
+=====
+
+### Significant changes relative to 1.0.0:
+
+1. The Huffman decoder will now handle erroneous Huffman codes (for instance,
+from a corrupt JPEG image.) Previously, these would cause libjpeg-turbo to
+crash under certain circumstances.
+
+2. Fixed typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
+be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
+
+3. The configure script will now automatically determine whether the
+`INCOMPLETE_TYPES_BROKEN` macro should be defined.
+
+
+1.0.0
+=====
+
+### Significant changes relative to 0.0.93:
+
+1. 2983700: Further FreeBSD build tweaks (no longer necessary to specify
+`--host` when configuring on a 64-bit system)
+
+2. Created symlinks in the Unix/Linux packages so that the TurboJPEG
+include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
+static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
+64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
+
+3. The Unix/Linux distribution packages now include the libjpeg run-time
+programs (cjpeg, etc.) and man pages.
+
+4. Created a 32-bit supplementary package for amd64 Debian systems, which
+contains just the 32-bit libjpeg-turbo libraries.
+
+5. Moved the libraries from */lib32 to */lib in the i386 Debian package.
+
+6. Include distribution package for Cygwin
+
+7. No longer necessary to specify `--without-simd` on non-x86 architectures,
+and unit tests now work on those architectures.
+
+
+0.0.93
+======
+
+### Significant changes since 0.0.91:
+
+1. 2982659: Fixed x86-64 build on FreeBSD systems
+
+2. 2988188: Added support for Windows 64-bit systems
+
+
+0.0.91
+======
+
+### Significant changes relative to 0.0.90:
+
+1. Added documentation to .deb packages
+
+2. 2968313: Fixed data corruption issues when decompressing large JPEG images
+and/or using buffered I/O with the libjpeg-turbo decompressor
+
+
+0.0.90
+======
+
+Initial release
diff --git a/media/libjpeg/LICENSE.md b/media/libjpeg/LICENSE.md
new file mode 100644
index 0000000000..bf8a7fda7f
--- /dev/null
+++ b/media/libjpeg/LICENSE.md
@@ -0,0 +1,132 @@
+libjpeg-turbo Licenses
+======================
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses:
+
+- The IJG (Independent JPEG Group) License, which is listed in
+ [README.ijg](README.ijg)
+
+ This license applies to the libjpeg API library and associated programs
+ (any code inherited from libjpeg, and any modifications to that code.)
+
+- The Modified (3-clause) BSD License, which is listed below
+
+ This license covers the TurboJPEG API library and associated programs, as
+ well as the build system.
+
+- The [zlib License](https://opensource.org/licenses/Zlib)
+
+ This license is a subset of the other two, and it covers the libjpeg-turbo
+ SIMD extensions.
+
+
+Complying with the libjpeg-turbo Licenses
+=========================================
+
+This section provides a roll-up of the libjpeg-turbo licensing terms, to the
+best of our understanding.
+
+1. If you are distributing a modified version of the libjpeg-turbo source,
+ then:
+
+ 1. You cannot alter or remove any existing copyright or license notices
+ from the source.
+
+ **Origin**
+ - Clause 1 of the IJG License
+ - Clause 1 of the Modified BSD License
+ - Clauses 1 and 3 of the zlib License
+
+ 2. You must add your own copyright notice to the header of each source
+ file you modified, so others can tell that you modified that file (if
+ there is not an existing copyright header in that file, then you can
+ simply add a notice stating that you modified the file.)
+
+ **Origin**
+ - Clause 1 of the IJG License
+ - Clause 2 of the zlib License
+
+ 3. You must include the IJG README file, and you must not alter any of the
+ copyright or license text in that file.
+
+ **Origin**
+ - Clause 1 of the IJG License
+
+2. If you are distributing only libjpeg-turbo binaries without the source, or
+ if you are distributing an application that statically links with
+ libjpeg-turbo, then:
+
+ 1. Your product documentation must include a message stating:
+
+ This software is based in part on the work of the Independent JPEG
+ Group.
+
+ **Origin**
+ - Clause 2 of the IJG license
+
+ 2. If your binary distribution includes or uses the TurboJPEG API, then
+ your product documentation must include the text of the Modified BSD
+ License (see below.)
+
+ **Origin**
+ - Clause 2 of the Modified BSD License
+
+3. You cannot use the name of the IJG or The libjpeg-turbo Project or the
+ contributors thereof in advertising, publicity, etc.
+
+ **Origin**
+ - IJG License
+ - Clause 3 of the Modified BSD License
+
+4. The IJG and The libjpeg-turbo Project do not warrant libjpeg-turbo to be
+ free of defects, nor do we accept any liability for undesirable
+ consequences resulting from your use of the software.
+
+ **Origin**
+ - IJG License
+ - Modified BSD License
+ - zlib License
+
+
+The Modified (3-clause) BSD License
+===================================
+
+Copyright (C)2009-2023 D. R. Commander. All Rights Reserved.<br>
+Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+Why Three Licenses?
+===================
+
+The zlib License could have been used instead of the Modified (3-clause) BSD
+License, and since the IJG License effectively subsumes the distribution
+conditions of the zlib License, this would have effectively placed
+libjpeg-turbo binary distributions under the IJG License. However, the IJG
+License specifically refers to the Independent JPEG Group and does not extend
+attribution and endorsement protections to other entities. Thus, it was
+desirable to choose a license that granted us the same protections for new code
+that were granted to the IJG for code derived from their software.
diff --git a/media/libjpeg/MOZCHANGES b/media/libjpeg/MOZCHANGES
new file mode 100644
index 0000000000..975cd385c5
--- /dev/null
+++ b/media/libjpeg/MOZCHANGES
@@ -0,0 +1,163 @@
+To upgrade to a new revision of libjpeg-turbo, do the following:
+
+* Check out libjpeg-turbo from git:
+
+ $ git clone https://github.com/libjpeg-turbo/libjpeg-turbo.git
+
+* In a clean clone of mozilla-central, run the update script (tag defaults to HEAD):
+
+ $ ./media/update-libjpeg.sh /path/to/libjpeg-turbo [tag]
+
+ and fix up any rejects from applying the Mozilla-specific patches at the end
+ of that script.
+
+* Since libjpeg-turbo normally creates jconfig.h and jconfigint.h at build time
+ and we use pre-generated versions, changes to jconfig.h.in and jconfigint.h.in
+ should be looked for and noted for later inclusion.
+
+* Now look through the new files and rm any which are not part of the build
+ (npotb). When I upgraded to libjpeg-turbo 1.1.0, the only files I kept which
+ didn't match
+
+ *.c *.h *.asm *.inc
+
+ were README and README-turbo.
+
+ You can easily look for all files other than *.c, *.h, *.asm, *.inc, *.md, and
+ *.ijg files by running
+
+ $ hg status -nu | grep -v '\(c\|h\|asm\|inc\|md\|ijg\)$'
+
+ Once you're comfortable that you're only deleting files you want to delete
+ (and you've hg add'ed the files you want to keep), you can nuke the remaining
+ files with
+
+ $ hg status -nu | grep -v '\(c\|h\|asm\|inc\|md\|ijg\)$' | xargs rm
+
+ A helpful command for finding the *.c files which aren't *currently* part of
+ the build is
+
+ diff <(ls *.c | sort) <(grep -o '\w*\.c' Makefile.in | sort)
+
+ Of course, libjpeg-turbo might have added some new source files, so you'll
+ have to look through and figure out which of these files to keep.
+
+* Update jconfig.h and jconfigint.h as noted previously.
+
+* Update moz.build to build any new files.
+
+* Finally, tell hg that we've added or removed some files:
+
+ $ hg addremove
+
+== February 8, 2023 (libjpeg-turbo v2.1.5.1 8ecba3647edb6dd940463fedf38ca33a8e2a73d1 2023-02-08) ==
+
+* Updated to v2.1.5.1 release.
+
+== November 10, 2022 (libjpeg-turbo v2.1.4 8162eddf041e0be26f5c671bb6528723c55fed9d 2022-08-12) ==
+
+* Updated to v2.1.4 release.
+
+== February 28, 2022 (libjpeg-turbo v2.1.3 c5f269eb9665435271c05fbcaf8721fa58e9eafa 2022-02-25) ==
+
+* Updated to v2.1.3 release.
+
+== September 9, 2021 (libjpeg-turbo v2.1.1 0a9b9721782d3a60a5c16c8c9a7abf3d4b1ecd42 2021-08-10) ==
+
+* Updated to v2.1.1 release.
+
+== November 19, 2020 (libjpeg-turbo v2.0.6 10ba6ed3365615ed5c2995fe2d240cb2d5000173 2020-11-16) ==
+
+* Updated to v2.0.6 release.
+
+== January 6, 2020 (libjpeg-turbo v2.0.4 166e34213e4f4e2363ce058a7bcc69fd03e38b76 2019-12-31) ==
+
+* Updated to v2.0.4 release.
+
+== September 5, 2019 (libjpeg-turbo v2.0.3 5db6a6819d0f904e0b58f34ae928fea234adb1a0 2019-09-04) ==
+
+* Updated to v2.0.3 release.
+
+== October 4, 2018 (libjpeg-turbo v2.0.0 574f3a772c96dc9db2c98ef24706feb3f6dbda9a 2018-06-27) ==
+
+* Updated to v2.0.0 release.
+
+== July 13, 2017 (libjpeg-turbo v1.5.2 e5c1613ccdfeffcd060fd94248b7c8ac7c0cfb0f 2017-08-09) ==
+
+* Updated to v1.5.2 release.
+
+== September 22, 2016 (libjpeg-turbo v1.5.1 cb88e5da8003afcdc443b787fdcb77285e5a8a02 2016-09-20) ==
+
+* Updated to v1.5.1 release.
+
+== June 23, 2016 (libjpeg-turbo v1.5.0 3ff13e651bbe6de9c6f15d05235d1d4f26f63ffc 2016-05-31) ==
+
+* Updated to v1.5.0 release.
+
+== October 5, 2015 (libjpeg-turbo v1.4.2 d8da49effe6460d55239c4c009c57f42d8e4a494 2015-09-21) ==
+
+* Updated to v1.4.2 release.
+
+== January 15, 2015 (libjpeg-turbo v1.4.0 r1481 2015-01-07) ==
+
+* Updated to v1.4.0 release.
+
+== March 24, 2014 (libjpeg-turbo v1.3.1 r1205 2014-03-22) ==
+
+* Updated to v1.3.1 release.
+
+== November 25, 2013 ==
+
+* Fix bug 891693.
+
+== June 4, 2013 (libjpeg-turbo v1.3.0 r988 2013-05-25) ==
+
+* Updated to v1.3.0 release.
+
+== December 12, 2012 ==
+
+* Replace the runtime computed jpeg_nbits_table with constants in
+ jpeg_nbits_table.h to make it shareable among processes. (bug 815473)
+
+== October 13, 2012 ==
+
+* Modified config.h to use MOZ_ALWAYS_INLINE (bug 800106).
+
+== July 4, 2012 (libjpeg-turbo v1.2.1 r853 2012-06-30) ==
+
+* Updated to v1.2.1 stable release.
+
+== June 5, 2012 (libjpeg-turbo v1.2.x branch, r831 2012-05-30) ==
+
+* Updated to latest version on v1.2.x branch (bug 759891).
+
+== February 10, 2012 (libjpeg-turbo v1.2.0 r807 2012-02-10) ==
+
+* Imported jchuff.c, jdhuff.c, jdhuff.h under new licensing.
+
+* Created mozilla.diff for the required jmorecfg.h changes and to allow for any
+ future changes made by Mozilla to upstream files.
+
+* Removed the following files which are unused by the Mozilla build:
+
+ cderror.h, cdjpeg.h, jconfig.h.in, transupp.h, simd/jsimdcfg.inc.h
+
+
+== March 28, 2011 (initial commit, libjpeg-turbo v1.1.0 r469 2011-02-27) ==
+
+* Modified jmorecfg.h to define UINT8, UINT16, INT16, and INT32 in terms of
+ prtypes to fix a build error on Windows.
+
+* Defined INLINE as NS_ALWAYS_INLINE in jconfig.h.
+
+* Removed the following files which are licensed under the wxWindows license:
+
+ bmp.c, bmp.h, jpegut.c, jpgtest.cxx, rrtimer.h, rrutil.h, turbojpeg.h,
+ turbojpegl.c
+
+* Reverted the following files to what was previously in Mozilla's tree
+ (nominally libjpeg 6.2):
+
+ jchuff.c, jdhuff.c, jdhuff.h
+
+ since the versions of these files in libjpeg-turbo are also under the
+ wxWindows license. (It would have been nicer to revert them to the new
+ libjpeg-8b code, but that doesn't easily integrate with libjpeg-turbo.)
diff --git a/media/libjpeg/README.ijg b/media/libjpeg/README.ijg
new file mode 100644
index 0000000000..9453c19501
--- /dev/null
+++ b/media/libjpeg/README.ijg
@@ -0,0 +1,258 @@
+libjpeg-turbo note: This file has been modified by The libjpeg-turbo Project
+to include only information relevant to libjpeg-turbo, to wordsmith certain
+sections, and to remove impolitic language that existed in the libjpeg v8
+README. It is included only for reference. Please see README.md for
+information specific to libjpeg-turbo.
+
+
+The Independent JPEG Group's JPEG software
+==========================================
+
+This distribution contains a release of the Independent JPEG Group's free JPEG
+software. You are welcome to redistribute this software and to use it for any
+purpose, subject to the conditions under LEGAL ISSUES, below.
+
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.
+
+IJG is not affiliated with the ISO/IEC JTC1/SC29/WG1 standards committee
+(also known as JPEG, together with ITU-T SG16).
+
+
+DOCUMENTATION ROADMAP
+=====================
+
+This file contains the following sections:
+
+OVERVIEW General description of JPEG and the IJG software.
+LEGAL ISSUES Copyright, lack of warranty, terms of distribution.
+REFERENCES Where to learn more about JPEG.
+ARCHIVE LOCATIONS Where to find newer versions of this software.
+FILE FORMAT WARS Software *not* to get.
+TO DO Plans for future IJG releases.
+
+Other documentation files in the distribution are:
+
+User documentation:
+ usage.txt Usage instructions for cjpeg, djpeg, jpegtran,
+ rdjpgcom, and wrjpgcom.
+ *.1 Unix-style man pages for programs (same info as usage.txt).
+ wizard.txt Advanced usage instructions for JPEG wizards only.
+ change.log Version-to-version change highlights.
+Programmer and internal documentation:
+ libjpeg.txt How to use the JPEG library in your own programs.
+ example.txt Sample code for calling the JPEG library.
+ structure.txt Overview of the JPEG library's internal structure.
+ coderules.txt Coding style rules --- please read if you contribute code.
+
+Please read at least usage.txt. Some information can also be found in the JPEG
+FAQ (Frequently Asked Questions) article. See ARCHIVE LOCATIONS below to find
+out where to obtain the FAQ article.
+
+If you want to understand how the JPEG code works, we suggest reading one or
+more of the REFERENCES, then looking at the documentation files (in roughly
+the order listed) before diving into the code.
+
+
+OVERVIEW
+========
+
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding. JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and grayscale images. JPEG's strong suit is compressing
+photographic images or other types of images that have smooth color and
+brightness transitions between neighboring pixels. Images with sharp lines or
+other abrupt features may not compress well with JPEG, and a higher JPEG
+quality may have to be used to avoid visible compression artifacts with such
+images.
+
+JPEG is lossy, meaning that the output pixels are not necessarily identical to
+the input pixels. However, on photographic content and other "smooth" images,
+very good compression ratios can be obtained with no visible compression
+artifacts, and extremely high compression ratios are possible if you are
+willing to sacrifice image quality (by reducing the "quality" setting in the
+compressor.)
+
+This software implements JPEG baseline, extended-sequential, and progressive
+compression processes. Provision is made for supporting all variants of these
+processes, although some uncommon parameter settings aren't implemented yet.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.
+
+We provide a set of library routines for reading and writing JPEG image files,
+plus two sample applications "cjpeg" and "djpeg", which use the library to
+perform conversion between JPEG and some other popular image file formats.
+The library is intended to be reused in other applications.
+
+In order to support file conversion and viewing software, we have included
+considerable functionality beyond the bare JPEG coding/decoding capability;
+for example, the color quantization modules are not strictly part of JPEG
+decoding, but they are essential for output to colormapped file formats or
+colormapped displays. These extra functions can be compiled out of the
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.
+
+The emphasis in designing this software has been on achieving portability and
+flexibility, while also making it fast enough to be useful. In particular,
+the software is not intended to be read as a tutorial on JPEG. (See the
+REFERENCES section for introductory material.) Rather, it is intended to
+be reliable, portable, industrial-strength code. We do not claim to have
+achieved that goal in every aspect of the software, but we strive for it.
+
+We welcome the use of this software as a component of commercial products.
+No royalty is required, but we do ask for an acknowledgement in product
+documentation, as described under LEGAL ISSUES.
+
+
+LEGAL ISSUES
+============
+
+In plain English:
+
+1. We don't promise that this software works. (But if you find any bugs,
+ please let us know!)
+2. You can use this software for whatever you want. You don't have to pay us.
+3. You may not pretend that you wrote this software. If you use it in a
+ program, you must acknowledge somewhere in your documentation that
+ you've used the IJG code.
+
+In legalese:
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose. This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library. If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it. This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
+
+
+REFERENCES
+==========
+
+We recommend reading one or more of these references before trying to
+understand the innards of the JPEG software.
+
+The best short technical introduction to the JPEG compression algorithm is
+ Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
+ Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+(Adjacent articles in that issue discuss MPEG motion picture compression,
+applications of JPEG, and related topics.) If you don't have the CACM issue
+handy, a PDF file containing a revised version of Wallace's article is
+available at http://www.ijg.org/files/Wallace.JPEG.pdf. The file (actually
+a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
+omits the sample images that appeared in CACM, but it includes corrections
+and some added material. Note: the Wallace article is copyright ACM and IEEE,
+and it may not be used for commercial purposes.
+
+A somewhat less technical, more leisurely introduction to JPEG can be found in
+"The Data Compression Book" by Mark Nelson and Jean-loup Gailly, published by
+M&T Books (New York), 2nd ed. 1996, ISBN 1-55851-434-1. This book provides
+good explanations and example C code for a multitude of compression methods
+including JPEG. It is an excellent source if you are comfortable reading C
+code but don't know much about data compression in general. The book's JPEG
+sample code is far from industrial-strength, but when you are ready to look
+at a full implementation, you've got one here...
+
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp. The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods. Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
+Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
+10918-1, ITU-T T.81. Part 2 is titled "Digital Compression and Coding of
+Continuous-tone Still Images, Part 2: Compliance testing" and has document
+numbers ISO/IEC IS 10918-2, ITU-T T.83.
+
+The JPEG standard does not specify all details of an interchangeable file
+format. For the omitted details, we follow the "JFIF" conventions, revision
+1.02. JFIF version 1 has been adopted as ISO/IEC 10918-5 (05/2013) and
+Recommendation ITU-T T.871 (05/2011): Information technology - Digital
+compression and coding of continuous-tone still images: JPEG File Interchange
+Format (JFIF). It is available as a free download in PDF file format from
+https://www.iso.org/standard/54989.html and http://www.itu.int/rec/T-REC-T.871.
+A PDF file of the older JFIF 1.02 specification is available at
+http://www.w3.org/Graphics/JPEG/jfif3.pdf.
+
+The TIFF 6.0 file format specification can be obtained from
+http://mirrors.ctan.org/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation
+scheme found in the TIFF 6.0 spec of 3-June-92 has a number of serious
+problems. IJG does not recommend use of the TIFF 6.0 design (TIFF Compression
+tag 6). Instead, we recommend the JPEG design proposed by TIFF Technical Note
+#2 (Compression tag 7). Copies of this Note can be obtained from
+http://www.ijg.org/files/. It is expected that the next revision
+of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
+Although IJG's own code does not support TIFF/JPEG, the free libtiff library
+uses our library to implement TIFF/JPEG per the Note.
+
+
+ARCHIVE LOCATIONS
+=================
+
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".
+
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG. It is available at
+http://www.faqs.org/faqs/jpeg-faq.
+
+
+FILE FORMAT COMPATIBILITY
+=========================
+
+This software implements ITU T.81 | ISO/IEC 10918 with some extensions from
+ITU T.871 | ISO/IEC 10918-5 (JPEG File Interchange Format-- see REFERENCES).
+Informally, the term "JPEG image" or "JPEG file" most often refers to JFIF or
+a subset thereof, but there are other formats containing the name "JPEG" that
+are incompatible with the DCT-based JPEG standard or with JFIF (for instance,
+JPEG 2000 and JPEG XR). This software therefore does not support these
+formats. Indeed, one of the original reasons for developing this free software
+was to help force convergence on a common, interoperable format standard for
+JPEG files.
+
+JFIF is a minimal or "low end" representation. TIFF/JPEG (TIFF revision 6.0 as
+modified by TIFF Technical Note #2) can be used for "high end" applications
+that need to record a lot of additional data about an image.
+
+
+TO DO
+=====
+
+Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org.
diff --git a/media/libjpeg/README.md b/media/libjpeg/README.md
new file mode 100644
index 0000000000..01e391ea7c
--- /dev/null
+++ b/media/libjpeg/README.md
@@ -0,0 +1,357 @@
+Background
+==========
+
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
+systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal. On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines. In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
+
+libjpeg-turbo implements both the traditional libjpeg API and the less
+powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
+colorspace extensions that allow it to compress from/decompress to 32-bit and
+big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
+interface.
+
+libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
+derivative of libjpeg v6b developed by Miyasaka Masaru. The TigerVNC and
+VirtualGL projects made numerous enhancements to the codec in 2009, and in
+early 2010, libjpeg-turbo spun off into an independent project, with the goal
+of making high-speed JPEG compression/decompression technology available to a
+broader range of users and developers.
+
+
+License
+=======
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses.
+Refer to [LICENSE.md](LICENSE.md) for a roll-up of license terms.
+
+
+Building libjpeg-turbo
+======================
+
+Refer to [BUILDING.md](BUILDING.md) for complete instructions.
+
+
+Using libjpeg-turbo
+===================
+
+libjpeg-turbo includes two APIs that can be used to compress and decompress
+JPEG images:
+
+- **TurboJPEG API**<br>
+ This API provides an easy-to-use interface for compressing and decompressing
+ JPEG images in memory. It also provides some functionality that would not be
+ straightforward to achieve using the underlying libjpeg API, such as
+ generating planar YUV images and performing multiple simultaneous lossless
+ transforms on an image. The Java interface for libjpeg-turbo is written on
+ top of the TurboJPEG API. The TurboJPEG API is recommended for first-time
+ users of libjpeg-turbo. Refer to [tjexample.c](tjexample.c) and
+ [TJExample.java](java/TJExample.java) for examples of its usage and to
+ <http://libjpeg-turbo.org/Documentation/Documentation> for API documentation.
+
+- **libjpeg API**<br>
+ This is the de facto industry-standard API for compressing and decompressing
+ JPEG images. It is more difficult to use than the TurboJPEG API but also
+ more powerful. The libjpeg API implementation in libjpeg-turbo is both
+ API/ABI-compatible and mathematically compatible with libjpeg v6b. It can
+ also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8
+ (see below.) Refer to [cjpeg.c](cjpeg.c) and [djpeg.c](djpeg.c) for examples
+ of its usage and to [libjpeg.txt](libjpeg.txt) for API documentation.
+
+There is no significant performance advantage to either API when both are used
+to perform similar operations.
+
+Colorspace Extensions
+---------------------
+
+libjpeg-turbo includes extensions that allow JPEG images to be compressed
+directly from (and decompressed directly to) buffers that use BGR, BGRX,
+RGBX, XBGR, and XRGB pixel ordering. This is implemented with ten new
+colorspace constants:
+
+ JCS_EXT_RGB /* red/green/blue */
+ JCS_EXT_RGBX /* red/green/blue/x */
+ JCS_EXT_BGR /* blue/green/red */
+ JCS_EXT_BGRX /* blue/green/red/x */
+ JCS_EXT_XBGR /* x/blue/green/red */
+ JCS_EXT_XRGB /* x/red/green/blue */
+ JCS_EXT_RGBA /* red/green/blue/alpha */
+ JCS_EXT_BGRA /* blue/green/red/alpha */
+ JCS_EXT_ABGR /* alpha/blue/green/red */
+ JCS_EXT_ARGB /* alpha/red/green/blue */
+
+Setting `cinfo.in_color_space` (compression) or `cinfo.out_color_space`
+(decompression) to one of these values will cause libjpeg-turbo to read the
+red, green, and blue values from (or write them to) the appropriate position in
+the pixel when compressing from/decompressing to an RGB buffer.
+
+Your application can check for the existence of these extensions at compile
+time with:
+
+ #ifdef JCS_EXTENSIONS
+
+At run time, attempting to use these extensions with a libjpeg implementation
+that does not support them will result in a "Bogus input colorspace" error.
+Applications can trap this error in order to test whether run-time support is
+available for the colorspace extensions.
+
+When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the
+X byte is undefined, and in order to ensure the best performance, libjpeg-turbo
+can set that byte to whatever value it wishes. If an application expects the X
+byte to be used as an alpha channel, then it should specify `JCS_EXT_RGBA`,
+`JCS_EXT_BGRA`, `JCS_EXT_ABGR`, or `JCS_EXT_ARGB`. When these colorspace
+constants are used, the X byte is guaranteed to be 0xFF, which is interpreted
+as opaque.
+
+Your application can check for the existence of the alpha channel colorspace
+extensions at compile time with:
+
+ #ifdef JCS_ALPHA_EXTENSIONS
+
+[jcstest.c](jcstest.c), located in the libjpeg-turbo source tree, demonstrates
+how to check for the existence of the colorspace extensions at compile time and
+run time.
+
+libjpeg v7 and v8 API/ABI Emulation
+-----------------------------------
+
+With libjpeg v7 and v8, new features were added that necessitated extending the
+compression and decompression structures. Unfortunately, due to the exposed
+nature of those structures, extending them also necessitated breaking backward
+ABI compatibility with previous libjpeg releases. Thus, programs that were
+built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
+based on the libjpeg v6b code base. Although libjpeg v7 and v8 are not
+as widely used as v6b, enough programs (including a few Linux distros) made
+the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs
+in libjpeg-turbo. It should be noted, however, that this feature was added
+primarily so that applications that had already been compiled to use libjpeg
+v7+ could take advantage of accelerated baseline JPEG encoding/decoding
+without recompiling. libjpeg-turbo does not claim to support all of the
+libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
+cases (see below.)
+
+By passing an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you
+can build a version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so
+that programs that are built against libjpeg v7 or v8 can be run with
+libjpeg-turbo. The following section describes which libjpeg v7+ features are
+supported and which aren't.
+
+### Support for libjpeg v7 and v8 Features
+
+#### Fully supported
+
+- **libjpeg API: IDCT scaling extensions in decompressor**<br>
+ libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
+ 1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
+ and 1/2 are SIMD-accelerated.)
+
+- **libjpeg API: Arithmetic coding**
+
+- **libjpeg API: In-memory source and destination managers**<br>
+ See notes below.
+
+- **cjpeg: Separate quality settings for luminance and chrominance**<br>
+ Note that the libjpeg v7+ API was extended to accommodate this feature only
+ for convenience purposes. It has always been possible to implement this
+ feature with libjpeg v6b (see rdswitch.c for an example.)
+
+- **cjpeg: 32-bit BMP support**
+
+- **cjpeg: `-rgb` option**
+
+- **jpegtran: Lossless cropping**
+
+- **jpegtran: `-perfect` option**
+
+- **jpegtran: Forcing width/height when performing lossless crop**
+
+- **rdjpgcom: `-raw` option**
+
+- **rdjpgcom: Locale awareness**
+
+
+#### Not supported
+
+NOTE: As of this writing, extensive research has been conducted into the
+usefulness of DCT scaling as a means of data reduction and SmartScale as a
+means of quality improvement. Readers are invited to peruse the research at
+<http://www.libjpeg-turbo.org/About/SmartScale> and draw their own conclusions,
+but it is the general belief of our project that these features have not
+demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
+
+- **libjpeg API: DCT scaling in compressor**<br>
+ `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
+ There is no technical reason why DCT scaling could not be supported when
+ emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
+ below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
+ 8/9 would be available, which is of limited usefulness.
+
+- **libjpeg API: SmartScale**<br>
+ `cinfo.block_size` is silently ignored.
+ SmartScale is an extension to the JPEG format that allows for DCT block
+ sizes other than 8x8. Providing support for this new format would be
+ feasible (particularly without full acceleration.) However, until/unless
+ the format becomes either an official industry standard or, at minimum, an
+ accepted solution in the community, we are hesitant to implement it, as
+ there is no sense of whether or how it might change in the future. It is
+ our belief that SmartScale has not demonstrated sufficient usefulness as a
+ lossless format nor as a means of quality enhancement, and thus our primary
+ interest in providing this feature would be as a means of supporting
+ additional DCT scaling factors.
+
+- **libjpeg API: Fancy downsampling in compressor**<br>
+ `cinfo.do_fancy_downsampling` is silently ignored.
+ This requires the DCT scaling feature, which is not supported.
+
+- **jpegtran: Scaling**<br>
+ This requires both the DCT scaling and SmartScale features, which are not
+ supported.
+
+- **Lossless RGB JPEG files**<br>
+ This requires the SmartScale feature, which is not supported.
+
+### What About libjpeg v9?
+
+libjpeg v9 introduced yet another field to the JPEG compression structure
+(`color_transform`), thus making the ABI backward incompatible with that of
+libjpeg v8. This new field was introduced solely for the purpose of supporting
+lossless SmartScale encoding. Furthermore, there was actually no reason to
+extend the API in this manner, as the color transform could have just as easily
+been activated by way of a new JPEG colorspace constant, thus preserving
+backward ABI compatibility.
+
+Our research (see link above) has shown that lossless SmartScale does not
+generally accomplish anything that can't already be accomplished better with
+existing, standard lossless formats. Therefore, at this time it is our belief
+that there is not sufficient technical justification for software projects to
+upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient
+technical justification for us to emulate the libjpeg v9 ABI.
+
+In-Memory Source/Destination Managers
+-------------------------------------
+
+By default, libjpeg-turbo 1.3 and later include the `jpeg_mem_src()` and
+`jpeg_mem_dest()` functions, even when not emulating the libjpeg v8 API/ABI.
+Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8
+API/ABI emulation in order to use the in-memory source/destination managers,
+but several projects requested that those functions be included when emulating
+the libjpeg v6b API/ABI as well. This allows the use of those functions by
+programs that need them, without breaking ABI compatibility for programs that
+don't, and it allows those functions to be provided in the "official"
+libjpeg-turbo binaries.
+
+Those who are concerned about maintaining strict conformance with the libjpeg
+v6b or v7 API can pass an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to
+building libjpeg-turbo. This will restore the pre-1.3 behavior, in which
+`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
+libjpeg v8 API/ABI.
+
+On Un*x systems, including the in-memory source/destination managers changes
+the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
+emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
+
+Note that, on most Un*x systems, the dynamic linker will not look for a
+function in a library until that function is actually used. Thus, if a program
+is built against libjpeg-turbo 1.3+ and uses `jpeg_mem_src()` or
+`jpeg_mem_dest()`, that program will not fail if run against an older version
+of libjpeg-turbo or against libjpeg v7- until the program actually tries to
+call `jpeg_mem_src()` or `jpeg_mem_dest()`. Such is not the case on Windows.
+If a program is built against the libjpeg-turbo 1.3+ DLL and uses
+`jpeg_mem_src()` or `jpeg_mem_dest()`, then it must use the libjpeg-turbo 1.3+
+DLL at run time.
+
+Both cjpeg and djpeg have been extended to allow testing the in-memory
+source/destination manager functions. See their respective man pages for more
+details.
+
+
+Mathematical Compatibility
+==========================
+
+For the most part, libjpeg-turbo should produce identical output to libjpeg
+v6b. The one exception to this is when using the floating point DCT/IDCT, in
+which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the
+following reasons:
+
+- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so
+ slightly more accurate than the implementation in libjpeg v6b, but not by
+ any amount perceptible to human vision (generally in the range of 0.01 to
+ 0.08 dB gain in PSNR.)
+
+- When not using the SIMD extensions, libjpeg-turbo uses the more accurate
+ (and slightly faster) floating point IDCT algorithm introduced in libjpeg
+ v8a as opposed to the algorithm used in libjpeg v6b. It should be noted,
+ however, that this algorithm basically brings the accuracy of the floating
+ point IDCT in line with the accuracy of the accurate integer IDCT. The
+ floating point DCT/IDCT algorithms are mainly a legacy feature, and they do
+ not produce significantly more accuracy than the accurate integer algorithms
+ (to put numbers on this, the typical difference in PSNR between the two
+ algorithms is less than 0.10 dB, whereas changing the quality level by 1 in
+ the upper range of the quality scale is typically more like a 1.0 dB
+ difference.)
+
+- If the floating point algorithms in libjpeg-turbo are not implemented using
+ SIMD instructions on a particular platform, then the accuracy of the
+ floating point DCT/IDCT can depend on the compiler settings.
+
+While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood it is
+still using the same algorithms as libjpeg v6b, so there are several specific
+cases in which libjpeg-turbo cannot be expected to produce the same output as
+libjpeg v8:
+
+- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8
+ implements those scaling algorithms differently than libjpeg v6b does, and
+ libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior.
+
+- When using chrominance subsampling, because libjpeg v8 implements this
+ with its DCT/IDCT scaling algorithms rather than with a separate
+ downsampling/upsampling algorithm. In our testing, the subsampled/upsampled
+ output of libjpeg v8 is less accurate than that of libjpeg v6b for this
+ reason.
+
+- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or
+ "non-smooth") chrominance upsampling, because libjpeg v8 does not support
+ merged upsampling with scaling factors > 1.
+
+
+Performance Pitfalls
+====================
+
+Restart Markers
+---------------
+
+The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
+in a way that makes the rest of the libjpeg infrastructure happy, so it is
+necessary to use the slow Huffman decoder when decompressing a JPEG image that
+has restart markers. This can cause the decompression performance to drop by
+as much as 20%, but the performance will still be much greater than that of
+libjpeg. Many consumer packages, such as Photoshop, use restart markers when
+generating JPEG images, so images generated by those programs will experience
+this issue.
+
+Fast Integer Forward DCT at High Quality Levels
+-----------------------------------------------
+
+The algorithm used by the SIMD-accelerated quantization function cannot produce
+correct results whenever the fast integer forward DCT is used along with a JPEG
+quality of 98-100. Thus, libjpeg-turbo must use the non-SIMD quantization
+function in those cases. This causes performance to drop by as much as 40%.
+It is therefore strongly advised that you use the accurate integer forward DCT
+whenever encoding images with a JPEG quality of 98 or higher.
+
+
+Memory Debugger Pitfalls
+========================
+
+Valgrind and Memory Sanitizer (MSan) can generate false positives
+(specifically, incorrect reports of uninitialized memory accesses) when used
+with libjpeg-turbo's SIMD extensions. It is generally recommended that the
+SIMD extensions be disabled, either by passing an argument of `-DWITH_SIMD=0`
+to `cmake` when configuring the build or by setting the environment variable
+`JSIMD_FORCENONE` to `1` at run time, when testing libjpeg-turbo with Valgrind,
+MSan, or other memory debuggers.
diff --git a/media/libjpeg/jaricom.c b/media/libjpeg/jaricom.c
new file mode 100644
index 0000000000..215640cc44
--- /dev/null
+++ b/media/libjpeg/jaricom.c
@@ -0,0 +1,157 @@
+/*
+ * jaricom.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Developed 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, 2018, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.2 in
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 and Table 24 in
+ * Recommendation ITU-T T.82 (1993) | ISO/IEC 11544:1993.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* V() packs one table row into a JLONG: a (Qe_Value) << 16,
+ * c (Next_Index_MPS) << 8, d (Switch_MPS) << 7, b (Next_Index_LPS) in the
+ * low bits.  i is not used in the expansion; it only labels the row.
+ * Arguments are parenthesized so compound expressions expand safely.
+ * This formula must match the arithmetic encoder and decoder implementation;
+ * change them together.  The layout is modeled on Markus Kuhn's JBIG
+ * implementation (jbig_tab.c).
+ */
+
+#define V(i, a, b, c, d) \
+ (((JLONG)(a) << 16) | ((JLONG)(c) << 8) | ((JLONG)(d) << 7) | (b))
+
+const JLONG jpeg_aritab[113 + 1] = {
+/*
+ * Columns: Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+ V( 0, 0x5a1d, 1, 1, 1 ),
+ V( 1, 0x2586, 14, 2, 0 ),
+ V( 2, 0x1114, 16, 3, 0 ),
+ V( 3, 0x080b, 18, 4, 0 ),
+ V( 4, 0x03d8, 20, 5, 0 ),
+ V( 5, 0x01da, 23, 6, 0 ),
+ V( 6, 0x00e5, 25, 7, 0 ),
+ V( 7, 0x006f, 28, 8, 0 ),
+ V( 8, 0x0036, 30, 9, 0 ),
+ V( 9, 0x001a, 33, 10, 0 ),
+ V( 10, 0x000d, 35, 11, 0 ),
+ V( 11, 0x0006, 9, 12, 0 ),
+ V( 12, 0x0003, 10, 13, 0 ),
+ V( 13, 0x0001, 12, 13, 0 ),
+ V( 14, 0x5a7f, 15, 15, 1 ),
+ V( 15, 0x3f25, 36, 16, 0 ),
+ V( 16, 0x2cf2, 38, 17, 0 ),
+ V( 17, 0x207c, 39, 18, 0 ),
+ V( 18, 0x17b9, 40, 19, 0 ),
+ V( 19, 0x1182, 42, 20, 0 ),
+ V( 20, 0x0cef, 43, 21, 0 ),
+ V( 21, 0x09a1, 45, 22, 0 ),
+ V( 22, 0x072f, 46, 23, 0 ),
+ V( 23, 0x055c, 48, 24, 0 ),
+ V( 24, 0x0406, 49, 25, 0 ),
+ V( 25, 0x0303, 51, 26, 0 ),
+ V( 26, 0x0240, 52, 27, 0 ),
+ V( 27, 0x01b1, 54, 28, 0 ),
+ V( 28, 0x0144, 56, 29, 0 ),
+ V( 29, 0x00f5, 57, 30, 0 ),
+ V( 30, 0x00b7, 59, 31, 0 ),
+ V( 31, 0x008a, 60, 32, 0 ),
+ V( 32, 0x0068, 62, 33, 0 ),
+ V( 33, 0x004e, 63, 34, 0 ),
+ V( 34, 0x003b, 32, 35, 0 ),
+ V( 35, 0x002c, 33, 9, 0 ),
+ V( 36, 0x5ae1, 37, 37, 1 ),
+ V( 37, 0x484c, 64, 38, 0 ),
+ V( 38, 0x3a0d, 65, 39, 0 ),
+ V( 39, 0x2ef1, 67, 40, 0 ),
+ V( 40, 0x261f, 68, 41, 0 ),
+ V( 41, 0x1f33, 69, 42, 0 ),
+ V( 42, 0x19a8, 70, 43, 0 ),
+ V( 43, 0x1518, 72, 44, 0 ),
+ V( 44, 0x1177, 73, 45, 0 ),
+ V( 45, 0x0e74, 74, 46, 0 ),
+ V( 46, 0x0bfb, 75, 47, 0 ),
+ V( 47, 0x09f8, 77, 48, 0 ),
+ V( 48, 0x0861, 78, 49, 0 ),
+ V( 49, 0x0706, 79, 50, 0 ),
+ V( 50, 0x05cd, 48, 51, 0 ),
+ V( 51, 0x04de, 50, 52, 0 ),
+ V( 52, 0x040f, 50, 53, 0 ),
+ V( 53, 0x0363, 51, 54, 0 ),
+ V( 54, 0x02d4, 52, 55, 0 ),
+ V( 55, 0x025c, 53, 56, 0 ),
+ V( 56, 0x01f8, 54, 57, 0 ),
+ V( 57, 0x01a4, 55, 58, 0 ),
+ V( 58, 0x0160, 56, 59, 0 ),
+ V( 59, 0x0125, 57, 60, 0 ),
+ V( 60, 0x00f6, 58, 61, 0 ),
+ V( 61, 0x00cb, 59, 62, 0 ),
+ V( 62, 0x00ab, 61, 63, 0 ),
+ V( 63, 0x008f, 61, 32, 0 ),
+ V( 64, 0x5b12, 65, 65, 1 ),
+ V( 65, 0x4d04, 80, 66, 0 ),
+ V( 66, 0x412c, 81, 67, 0 ),
+ V( 67, 0x37d8, 82, 68, 0 ),
+ V( 68, 0x2fe8, 83, 69, 0 ),
+ V( 69, 0x293c, 84, 70, 0 ),
+ V( 70, 0x2379, 86, 71, 0 ),
+ V( 71, 0x1edf, 87, 72, 0 ),
+ V( 72, 0x1aa9, 87, 73, 0 ),
+ V( 73, 0x174e, 72, 74, 0 ),
+ V( 74, 0x1424, 72, 75, 0 ),
+ V( 75, 0x119c, 74, 76, 0 ),
+ V( 76, 0x0f6b, 74, 77, 0 ),
+ V( 77, 0x0d51, 75, 78, 0 ),
+ V( 78, 0x0bb6, 77, 79, 0 ),
+ V( 79, 0x0a40, 77, 48, 0 ),
+ V( 80, 0x5832, 80, 81, 1 ),
+ V( 81, 0x4d1c, 88, 82, 0 ),
+ V( 82, 0x438e, 89, 83, 0 ),
+ V( 83, 0x3bdd, 90, 84, 0 ),
+ V( 84, 0x34ee, 91, 85, 0 ),
+ V( 85, 0x2eae, 92, 86, 0 ),
+ V( 86, 0x299a, 93, 87, 0 ),
+ V( 87, 0x2516, 86, 71, 0 ),
+ V( 88, 0x5570, 88, 89, 1 ),
+ V( 89, 0x4ca9, 95, 90, 0 ),
+ V( 90, 0x44d9, 96, 91, 0 ),
+ V( 91, 0x3e22, 97, 92, 0 ),
+ V( 92, 0x3824, 99, 93, 0 ),
+ V( 93, 0x32b4, 99, 94, 0 ),
+ V( 94, 0x2e17, 93, 86, 0 ),
+ V( 95, 0x56a8, 95, 96, 1 ),
+ V( 96, 0x4f46, 101, 97, 0 ),
+ V( 97, 0x47e5, 102, 98, 0 ),
+ V( 98, 0x41cf, 103, 99, 0 ),
+ V( 99, 0x3c3d, 104, 100, 0 ),
+ V( 100, 0x375e, 99, 93, 0 ),
+ V( 101, 0x5231, 105, 102, 0 ),
+ V( 102, 0x4c0f, 106, 103, 0 ),
+ V( 103, 0x4639, 107, 104, 0 ),
+ V( 104, 0x415e, 103, 99, 0 ),
+ V( 105, 0x5627, 105, 106, 1 ),
+ V( 106, 0x50e7, 108, 107, 0 ),
+ V( 107, 0x4b85, 109, 103, 0 ),
+ V( 108, 0x5597, 110, 109, 0 ),
+ V( 109, 0x504f, 111, 107, 0 ),
+ V( 110, 0x5a10, 110, 111, 1 ),
+ V( 111, 0x5522, 112, 109, 0 ),
+ V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * Entry 113 is a fixed probability estimate of 0.5 (its next indices stay
+ * at 113), as recommended in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+ V( 113, 0x5a1d, 113, 113, 0 )
+};
diff --git a/media/libjpeg/jcapimin.c b/media/libjpeg/jcapimin.c
new file mode 100644
index 0000000000..84e7ecc9a7
--- /dev/null
+++ b/media/libjpeg/jcapimin.c
@@ -0,0 +1,295 @@
+/*
+ * jcapimin.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains application interface code for the compression half
+ * of the JPEG library. These are the "minimum" API routines that may be
+ * needed in either the normal full-compression case or the transcoding-only
+ * case.
+ *
+ * Most of the routines intended to be called directly by an application
+ * are in this file or in jcapistd.c. But also see jcparam.c for
+ * parameter-setup helper routines, jcomapi.c for routines shared by
+ * compression and decompression, and jctrans.c for the transcoding case.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/*
+ * Initialization of a JPEG compression object.
+ * The error manager must already be set up (in case memory manager fails).
+ */
+
+GLOBAL(void)
+jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
+{
+  struct jpeg_error_mgr *saved_err;
+  void *saved_client_data;
+  int tblno;
+
+  /* Mark the memory manager as absent before anything can fail, so that
+   * jpeg_destroy can tell it was never created.
+   */
+  cinfo->mem = NULL;
+
+  /* Refuse callers built against a different library version or a
+   * differently sized master structure.
+   */
+  if (version != JPEG_LIB_VERSION) {
+    ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
+  }
+  if (structsize != sizeof(struct jpeg_compress_struct)) {
+    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
+             (int)sizeof(struct jpeg_compress_struct), (int)structsize);
+  }
+
+  /* Clear the entire master structure (a debugging aid).  The application
+   * has already installed its error manager and may have set client_data,
+   * so both are carried across the wipe.  Memory checkers such as Purify
+   * may flag the client_data read when the application never set it; that
+   * report can be ignored.
+   */
+  saved_err = cinfo->err;
+  saved_client_data = cinfo->client_data;
+  memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
+  cinfo->client_data = saved_client_data;
+  cinfo->err = saved_err;
+  cinfo->is_decompressor = FALSE;
+
+  /* Give this object its own memory manager instance. */
+  jinit_memory_mgr((j_common_ptr)cinfo);
+
+  /* None of the permanent structures exist yet. */
+  cinfo->progress = NULL;
+  cinfo->dest = NULL;
+  cinfo->comp_info = NULL;
+  cinfo->script_space = NULL;
+
+  for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
+    cinfo->quant_tbl_ptrs[tblno] = NULL;
+#if JPEG_LIB_VERSION >= 70
+    cinfo->q_scale_factor[tblno] = 100;
+#endif
+  }
+
+  for (tblno = 0; tblno < NUM_HUFF_TBLS; tblno++) {
+    cinfo->ac_huff_tbl_ptrs[tblno] = NULL;
+    cinfo->dc_huff_tbl_ptrs[tblno] = NULL;
+  }
+
+#if JPEG_LIB_VERSION >= 80
+  /* These must be valid this early because emit_dqt needs them when
+   * jpeg_write_tables is used.
+   */
+  cinfo->lim_Se = DCTSIZE2 - 1;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->block_size = DCTSIZE;
+#endif
+
+  /* Default gamma, in case the application never sets one. */
+  cinfo->input_gamma = 1.0;
+
+  /* The object is now ready for use. */
+  cinfo->global_state = CSTATE_START;
+}
+
+
+/*
+ * Destroy a JPEG compression object.
+ * All of the work is delegated to the common jpeg_destroy routine.
+ */
+
+GLOBAL(void)
+jpeg_destroy_compress(j_compress_ptr cinfo)
+{
+  j_common_ptr common = (j_common_ptr)cinfo;
+
+  jpeg_destroy(common);
+}
+
+
+/*
+ * Abandon the compression operation in progress while keeping the
+ * object itself alive.
+ * All of the work is delegated to the common jpeg_abort routine.
+ */
+
+GLOBAL(void)
+jpeg_abort_compress(j_compress_ptr cinfo)
+{
+  j_common_ptr common = (j_common_ptr)cinfo;
+
+  jpeg_abort(common);
+}
+
+
+/*
+ * Forcibly suppress or un-suppress all quantization and Huffman tables.
+ * Marks all currently defined tables as already written (if suppress)
+ * or not written (if !suppress). This will control whether they get emitted
+ * by a subsequent jpeg_start_compress call.
+ *
+ * This routine is exported for use by applications that want to produce
+ * abbreviated JPEG datastreams. It logically belongs in jcparam.c, but
+ * since it is called by jpeg_start_compress, we put it here --- otherwise
+ * jcparam.o would be linked whether the application used it or not.
+ */
+
+GLOBAL(void)
+jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress)
+{
+  int slot;
+
+  /* Quantization tables: stamp every slot that is actually populated. */
+  for (slot = 0; slot < NUM_QUANT_TBLS; slot++) {
+    JQUANT_TBL *quant = cinfo->quant_tbl_ptrs[slot];
+
+    if (quant != NULL)
+      quant->sent_table = suppress;
+  }
+
+  /* Huffman tables: same treatment for the DC and the AC table of each
+   * slot.
+   */
+  for (slot = 0; slot < NUM_HUFF_TBLS; slot++) {
+    JHUFF_TBL *dc_huff = cinfo->dc_huff_tbl_ptrs[slot];
+    JHUFF_TBL *ac_huff = cinfo->ac_huff_tbl_ptrs[slot];
+
+    if (dc_huff != NULL)
+      dc_huff->sent_table = suppress;
+    if (ac_huff != NULL)
+      ac_huff->sent_table = suppress;
+  }
+}
+
+
+/*
+ * Finish JPEG compression.
+ *
+ * If a multipass operating mode was selected, this may do a great deal of
+ * work including most of the actual output.
+ */
+
+GLOBAL(void)
+jpeg_finish_compress(j_compress_ptr cinfo)
+{
+  JDIMENSION row;
+
+  if (cinfo->global_state != CSTATE_SCANNING &&
+      cinfo->global_state != CSTATE_RAW_OK) {
+    /* No first pass to close; the only other accepted state is
+     * CSTATE_WRCOEFS.
+     */
+    if (cinfo->global_state != CSTATE_WRCOEFS)
+      ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  } else {
+    /* Close the first pass; every scanline must have been supplied. */
+    if (cinfo->next_scanline < cinfo->image_height)
+      ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
+    cinfo->master->finish_pass(cinfo);
+  }
+
+  /* Run whatever passes the master controller still has queued. */
+  while (!cinfo->master->is_last_pass) {
+    cinfo->master->prepare_for_pass(cinfo);
+
+    row = 0;
+    while (row < cinfo->total_iMCU_rows) {
+      /* Report progress in units of iMCU rows. */
+      if (cinfo->progress != NULL) {
+        cinfo->progress->pass_counter = (long)row;
+        cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows;
+        cinfo->progress->progress_monitor((j_common_ptr)cinfo);
+      }
+      /* The main controller is bypassed: the coefficient controller is
+       * driven directly, since all data now comes from the coefficient
+       * buffer.  Suspension cannot be honored here.
+       */
+      if (!cinfo->coef->compress_data(cinfo, (JSAMPIMAGE)NULL))
+        ERREXIT(cinfo, JERR_CANT_SUSPEND);
+      row++;
+    }
+
+    cinfo->master->finish_pass(cinfo);
+  }
+
+  /* Emit the file trailer (EOI) and let the destination flush. */
+  cinfo->marker->write_file_trailer(cinfo);
+  cinfo->dest->term_destination(cinfo);
+
+  /* jpeg_abort releases working memory and resets global_state. */
+  jpeg_abort((j_common_ptr)cinfo);
+}
+
+
+/*
+ * Write a special marker.
+ * This is only recommended for writing COM or APPn markers.
+ * Must be called after jpeg_start_compress() and before
+ * first call to jpeg_write_scanlines() or jpeg_write_raw_data().
+ */
+
+GLOBAL(void)
+jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                  unsigned int datalen)
+{
+  void (*put_byte) (j_compress_ptr info, int val);
+  unsigned int remaining;
+
+  /* Markers may only be written before any scanline has been consumed,
+   * and only while a compression (or coefficient-writing) cycle is open.
+   */
+  if (!(cinfo->next_scanline == 0 &&
+        (cinfo->global_state == CSTATE_SCANNING ||
+         cinfo->global_state == CSTATE_RAW_OK ||
+         cinfo->global_state == CSTATE_WRCOEFS)))
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Header first, then the payload one byte at a time. */
+  cinfo->marker->write_marker_header(cinfo, marker, datalen);
+
+  /* Fetch the method pointer once rather than on every iteration. */
+  put_byte = cinfo->marker->write_marker_byte;
+  for (remaining = datalen; remaining > 0; remaining--)
+    (*put_byte) (cinfo, *dataptr++);
+}
+
+/* Piecemeal variant of jpeg_write_marker: this call writes only the marker
+ * header; the payload bytes are then supplied through jpeg_write_m_byte.
+ */
+
+GLOBAL(void)
+jpeg_write_m_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
+{
+  /* Same state requirements as jpeg_write_marker. */
+  if (!(cinfo->next_scanline == 0 &&
+        (cinfo->global_state == CSTATE_SCANNING ||
+         cinfo->global_state == CSTATE_RAW_OK ||
+         cinfo->global_state == CSTATE_WRCOEFS)))
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  cinfo->marker->write_marker_header(cinfo, marker, datalen);
+}
+
+GLOBAL(void)
+jpeg_write_m_byte(j_compress_ptr cinfo, int val)
+{
+  /* Hand one payload byte to the marker writer; no state checking is
+   * done at this level.
+   */
+  cinfo->marker->write_marker_byte(cinfo, val);
+}
+
+
+/*
+ * Alternate compression function: just write an abbreviated table file.
+ * Before calling this, all parameters and a data destination must be set up.
+ *
+ * To produce a pair of files containing abbreviated tables and abbreviated
+ * image data, one would proceed as follows:
+ *
+ * initialize JPEG object
+ * set JPEG parameters
+ * set destination to table file
+ * jpeg_write_tables(cinfo);
+ * set destination to image file
+ * jpeg_start_compress(cinfo, FALSE);
+ * write data...
+ * jpeg_finish_compress(cinfo);
+ *
+ * jpeg_write_tables has the side effect of marking all tables written
+ * (same as jpeg_suppress_tables(..., TRUE)). Thus a subsequent start_compress
+ * will not re-emit the tables unless it is passed write_all_tables=TRUE.
+ */
+
+GLOBAL(void)
+jpeg_write_tables(j_compress_ptr cinfo)
+{
+  /* Only allowed while the object is in CSTATE_START. */
+  if (cinfo->global_state != CSTATE_START)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Bring the error manager and the destination module to a fresh state. */
+  cinfo->err->reset_error_mgr((j_common_ptr)cinfo);
+  cinfo->dest->init_destination(cinfo);
+
+  /* The marker writer has to be created here, awkward as that is. */
+  jinit_marker_writer(cinfo);
+
+  /* Emit the tables-only datastream, then let the destination finish. */
+  cinfo->marker->write_tables_only(cinfo);
+  cinfo->dest->term_destination(cinfo);
+
+  /*
+   * Deliberately no jpeg_abort() here.  Library releases up through v6a
+   * called it at this point to free the working memory allocated by the
+   * destination manager and the marker writer, but that also freed space
+   * which some applications had allocated for themselves from the library
+   * memory manager and still needed after write_tables.  The price is a
+   * memory leak when an application calls write_tables repeatedly without
+   * running a full compression cycle or otherwise resetting the JPEG
+   * object; that was judged less harmful than unexpectedly freeing memory
+   * in the normal case.  An application that wants the old behavior can
+   * call jpeg_abort itself after each call to jpeg_write_tables().
+   */
+}
diff --git a/media/libjpeg/jcapistd.c b/media/libjpeg/jcapistd.c
new file mode 100644
index 0000000000..aa2aad9f66
--- /dev/null
+++ b/media/libjpeg/jcapistd.c
@@ -0,0 +1,162 @@
+/*
+ * jcapistd.c
+ *
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains application interface code for the compression half
+ * of the JPEG library. These are the "standard" API routines that are
+ * used in the normal full-compression case. They are not used by a
+ * transcoding-only application. Note that if an application links in
+ * jpeg_start_compress, it will end up linking in the entire compressor.
+ * We thus must separate this file from jcapimin.c to avoid linking the
+ * whole compression library into a transcoder.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/*
+ * Compression initialization.
+ * Before calling this, all parameters and a data destination must be set up.
+ *
+ * We require a write_all_tables parameter as a failsafe check when writing
+ * multiple datastreams from the same compression object. Since prior runs
+ * will have left all the tables marked sent_table=TRUE, a subsequent run
+ * would emit an abbreviated stream (no tables) by default. This may be what
+ * is wanted, but for safety's sake it should not be the default behavior:
+ * programmers should have to make a deliberate choice to emit abbreviated
+ * images. Therefore the documentation and examples should encourage people
+ * to pass write_all_tables=TRUE; then it will take active thought to do the
+ * wrong thing.
+ */
+
+GLOBAL(void)
+jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
+{
+  /* Only allowed while the object is in CSTATE_START. */
+  if (cinfo->global_state != CSTATE_START)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Un-suppress every table when the caller asks for a complete stream. */
+  if (write_all_tables)
+    jpeg_suppress_tables(cinfo, FALSE);
+
+  /* Bring the error manager and the destination module to a fresh state. */
+  cinfo->err->reset_error_mgr((j_common_ptr)cinfo);
+  cinfo->dest->init_destination(cinfo);
+
+  /* Let the master controller select the active modules, then prepare
+   * the first pass.
+   */
+  jinit_compress_master(cinfo);
+  cinfo->master->prepare_for_pass(cinfo);
+
+  /* From here on the application drives the first pass, through either
+   * jpeg_write_scanlines or jpeg_write_raw_data depending on raw_data_in.
+   */
+  cinfo->next_scanline = 0;
+  if (cinfo->raw_data_in)
+    cinfo->global_state = CSTATE_RAW_OK;
+  else
+    cinfo->global_state = CSTATE_SCANNING;
+}
+
+
+/*
+ * Write some scanlines of data to the JPEG compressor.
+ *
+ * The return value will be the number of lines actually written.
+ * This should be less than the supplied num_lines only in case that
+ * the data destination module has requested suspension of the compressor,
+ * or if more than image_height scanlines are passed in.
+ *
+ * Note: we warn about excess calls to jpeg_write_scanlines() since
+ * this likely signals an application programmer error. However,
+ * excess scanlines passed in the last valid call are *silently* ignored,
+ * so that the application need not adjust num_lines for end-of-image
+ * when using a multiple-scanline buffer.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                     JDIMENSION num_lines)
+{
+  JDIMENSION consumed = 0;
+  JDIMENSION remaining;
+
+  if (cinfo->global_state != CSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* A call made after the whole image has been supplied only draws a
+   * warning; processing continues with zero rows left.
+   */
+  if (cinfo->next_scanline >= cinfo->image_height)
+    WARNMS(cinfo, JWRN_TOO_MUCH_DATA);
+
+  /* Report progress in units of scanlines, if a monitor is installed. */
+  if (cinfo->progress != NULL) {
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->progress_monitor((j_common_ptr)cinfo);
+  }
+
+  /* On the first call the master controller may still owe its pass
+   * startup.  Delaying it until now lets the application write COM and
+   * similar markers between jpeg_start_compress and the first scanlines.
+   */
+  if (cinfo->master->call_pass_startup)
+    cinfo->master->pass_startup(cinfo);
+
+  /* Silently drop rows beyond the bottom edge of the image. */
+  remaining = cinfo->image_height - cinfo->next_scanline;
+  if (remaining < num_lines)
+    num_lines = remaining;
+
+  /* The main controller reports how many rows it accepted. */
+  cinfo->main->process_data(cinfo, scanlines, &consumed, num_lines);
+  cinfo->next_scanline += consumed;
+  return consumed;
+}
+
+
+/*
+ * Raw-data counterpart of jpeg_write_scanlines.
+ * Each call compresses exactly one iMCU row, or nothing at all if the
+ * compressor suspends.  The return value is the number of lines consumed:
+ * one iMCU row's worth, or 0 on suspension or when the image is already
+ * complete.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                    JDIMENSION num_lines)
+{
+  JDIMENSION imcu_lines;
+
+  if (cinfo->global_state != CSTATE_RAW_OK)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Calls made after the last row are warned about and ignored. */
+  if (cinfo->next_scanline >= cinfo->image_height) {
+    WARNMS(cinfo, JWRN_TOO_MUCH_DATA);
+    return 0;
+  }
+
+  /* Report progress in units of scanlines, if a monitor is installed. */
+  if (cinfo->progress != NULL) {
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->progress_monitor((j_common_ptr)cinfo);
+  }
+
+  /* On the first call the master controller may still owe its pass
+   * startup.  Delaying it until now lets the application write COM and
+   * similar markers between jpeg_start_compress and the first raw data.
+   */
+  if (cinfo->master->call_pass_startup)
+    cinfo->master->pass_startup(cinfo);
+
+  /* The caller must supply at least one complete iMCU row. */
+  imcu_lines = cinfo->max_v_samp_factor * DCTSIZE;
+  if (num_lines < imcu_lines)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* Feed the row straight to the coefficient controller. */
+  if (cinfo->coef->compress_data(cinfo, data)) {
+    /* One whole iMCU row was consumed. */
+    cinfo->next_scanline += imcu_lines;
+    return imcu_lines;
+  }
+
+  /* The compressor did not take the whole row: suspend. */
+  return 0;
+}
diff --git a/media/libjpeg/jcarith.c b/media/libjpeg/jcarith.c
new file mode 100644
index 0000000000..b1720521bf
--- /dev/null
+++ b/media/libjpeg/jcarith.c
@@ -0,0 +1,932 @@
+/*
+ * jcarith.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Developed 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding.
+ * The coder registers (c, a, sc, zc, ct, buffer) are re-initialized
+ * together by emit_restart after every restart marker.
+ */
+
+typedef struct {
+ struct jpeg_entropy_encoder pub; /* public fields */
+
+ JLONG c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+ JLONG a; /* A register, normalized size of coding interval */
+ JLONG sc; /* counter for stacked 0xFF values which might overflow */
+ JLONG zc; /* counter for pending 0x00 output values which might *
+ * be discarded at the end ("Pacman" termination) */
+ int ct; /* bit shift counter, determines when next byte will be written */
+ int buffer; /* buffer for most recent output byte != 0xFF; -1 while empty */
+
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+ int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+ int next_restart_num; /* next restart number to write (0-7) */
+
+ /* Pointers to statistics areas (these workspaces have image lifespan).
+ * emit_restart clears DC_STAT_BINS bytes of each DC area and AC_STAT_BINS
+ * bytes of each AC area that the current scan uses.
+ */
+ unsigned char *dc_stats[NUM_ARITH_TBLS];
+ unsigned char *ac_stats[NUM_ARITH_TBLS];
+
+ /* Statistics bin for coding with fixed probability 0.5; it is handed to
+ * arith_encode for AC sign bits and for DC refinement bits.
+ */
+ unsigned char fixed_bin[4];
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder *arith_entropy_ptr; /* handle used throughout this module */
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64 /* bytes per DC statistics area (at least 49 bins needed) */
+#define AC_STAT_BINS 256 /* bytes per AC statistics area (at least 245 bins needed) */
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
+ * which should be safe.
+ *
+ * ISHIFT_TEMPS declares the scratch variable that the emulated form needs,
+ * so a function using IRIGHT_SHIFT must list ISHIFT_TEMPS among its locals
+ * (in the native-shift configuration it expands to nothing).
+ *
+ * NOTE(review): the emulated form ORs in (~0) << (16 - shft), i.e. it fills
+ * with ones from bit (16 - shft) upward.  This presumably relies on x
+ * fitting in 16 bits -- confirm before using it on wider values.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS int ishift_temp;
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
+#endif
+
+
+LOCAL(void)
+emit_byte(int val, j_compress_ptr cinfo)
+/* Append one byte to the destination buffer and hand the buffer to the
+ * destination manager once it is full.  Suspension is not supported in
+ * this module: a refusal to empty the buffer raises JERR_CANT_SUSPEND.
+ */
+{
+  struct jpeg_destination_mgr *out = cinfo->dest;
+
+  *out->next_output_byte = (JOCTET)val;
+  out->next_output_byte++;
+  out->free_in_buffer--;
+
+  if (out->free_in_buffer == 0 && !out->empty_output_buffer(cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ *
+ * Flushes whatever the coder still holds (the C register, the buffered
+ * byte and the pending 0xFF / 0x00 counters) to the destination through
+ * emit_byte.  emit_restart also calls this before each restart marker.
+ * The coder registers are not re-initialized here; emit_restart does that
+ * itself after calling this routine.
+ */
+
+METHODDEF(void)
+finish_pass(j_compress_ptr cinfo)
+{
+ arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
+ JLONG temp;
+
+ /* Section D.1.8: Termination of encoding */
+
+ /* Find the e->c in the coding interval with the largest
+ * number of trailing zero bits:  take the top of the interval with its
+ * low 16 bits cleared, and step up by 0x8000 if that fell below the base */
+ if ((temp = (e->a - 1 + e->c) & 0xFFFF0000UL) < e->c)
+ e->c = temp + 0x8000L;
+ else
+ e->c = temp;
+ /* Send remaining bytes to output; first shift C by the ct bits that were
+ * still outstanding for the byte in progress */
+ e->c <<= e->ct;
+ if (e->c & 0xF8000000UL) { /* any bit above the byte at bits 19..26 set? */
+ /* One final overflow has to be handled: the carry is added to the
+ * buffered byte */
+ if (e->buffer >= 0) { /* a byte is buffered (buffer is -1 while empty) */
+ if (e->zc) /* pending zero bytes go out ahead of the buffered byte */
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ emit_byte(e->buffer + 1, cinfo);
+ if (e->buffer + 1 == 0xFF)
+ emit_byte(0x00, cinfo); /* an emitted 0xFF is followed by a 0x00 byte */
+ }
+ e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */
+ e->sc = 0;
+ } else {
+ /* No overflow: the buffered byte and the stacked 0xFF bytes are final */
+ if (e->buffer == 0)
+ ++e->zc; /* a zero byte stays pending; it may never need to be written */
+ else if (e->buffer >= 0) {
+ if (e->zc)
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ emit_byte(e->buffer, cinfo);
+ }
+ if (e->sc) {
+ if (e->zc)
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ do { /* each stacked 0xFF goes out followed by a 0x00 byte */
+ emit_byte(0xFF, cinfo);
+ emit_byte(0x00, cinfo);
+ } while (--e->sc);
+ }
+ }
+ /* Output final bytes only if they are not 0x00 (mask covers bits 11..26) */
+ if (e->c & 0x7FFF800L) {
+ if (e->zc) /* output final pending zero bytes */
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ emit_byte((e->c >> 19) & 0xFF, cinfo); /* byte held in bits 19..26 of C */
+ if (((e->c >> 19) & 0xFF) == 0xFF)
+ emit_byte(0x00, cinfo);
+ if (e->c & 0x7F800L) { /* second byte (bits 11..18) only when nonzero */
+ emit_byte((e->c >> 11) & 0xFF, cinfo);
+ if (((e->c >> 11) & 0xFF) == 0xFF)
+ emit_byte(0x00, cinfo);
+ }
+ }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ * Parameter 'st' points to the one-byte statistics bin for this decision:
+ * bit 7 holds the current sense of the MPS and the low 7 bits index
+ * jpeg_aritab.  The bin is updated in place whenever the estimate changes.
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant to the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode(j_compress_ptr cinfo, unsigned char *st, int val)
+{
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
+ register unsigned char nl, nm;
+ register JLONG qe, temp;
+ register int sv;
+
+ /* Fetch values from our compact representation of Table D.2:
+ * Qe values and probability estimation state machine.
+ * Each table entry is unpacked from the low byte upward.
+ */
+ sv = *st;
+ qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+
+ /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+ e->a -= qe; /* A now holds the sub-interval size left for the MPS */
+ if (val != (sv >> 7)) { /* bit 7 of the bin is the current MPS sense */
+ /* Encode the less probable symbol */
+ if (e->a >= qe) {
+ /* If the interval size (qe) for the less probable symbol (LPS)
+ * is larger than the interval size for the MPS, then exchange
+ * the two symbols for coding efficiency, otherwise code the LPS
+ * as usual: */
+ e->c += e->a;
+ e->a = qe;
+ }
+ *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */
+ } else {
+ /* Encode the more probable symbol */
+ if (e->a >= 0x8000L)
+ return; /* A >= 0x8000 -> ready, no renormalization required */
+ if (e->a < qe) {
+ /* If the interval size (qe) for the less probable symbol (LPS)
+ * is larger than the interval size for the MPS, then exchange
+ * the two symbols for coding efficiency: */
+ e->c += e->a;
+ e->a = qe;
+ }
+ *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */
+ }
+
+ /* Renormalization & data output per section D.1.6:
+ * double A and C until A is back at or above 0x8000
+ */
+ do {
+ e->a <<= 1;
+ e->c <<= 1;
+ if (--e->ct == 0) {
+ /* Another byte is ready for output */
+ temp = e->c >> 19; /* the new byte, plus a carry in bit 8 if any */
+ if (temp > 0xFF) {
+ /* Handle overflow over all stacked 0xFF bytes */
+ if (e->buffer >= 0) { /* a byte is buffered (buffer is -1 while empty) */
+ if (e->zc)
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ emit_byte(e->buffer + 1, cinfo); /* the carry goes into the buffered byte */
+ if (e->buffer + 1 == 0xFF)
+ emit_byte(0x00, cinfo); /* an emitted 0xFF is followed by a 0x00 byte */
+ }
+ e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */
+ e->sc = 0;
+ /* Note: The 3 spacer bits in the C register guarantee
+ * that the new buffer byte can't be 0xFF here
+ * (see page 160 in the P&M JPEG book). */
+ e->buffer = temp & 0xFF; /* new output byte, might overflow later */
+ } else if (temp == 0xFF) {
+ ++e->sc; /* stack 0xFF byte (which might overflow later) */
+ } else {
+ /* Output all stacked 0xFF bytes, they will not overflow any more */
+ if (e->buffer == 0)
+ ++e->zc; /* a zero byte is only counted, not written yet */
+ else if (e->buffer >= 0) {
+ if (e->zc)
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ emit_byte(e->buffer, cinfo);
+ }
+ if (e->sc) {
+ if (e->zc)
+ do emit_byte(0x00, cinfo);
+ while (--e->zc);
+ do { /* each stacked 0xFF goes out followed by a 0x00 byte */
+ emit_byte(0xFF, cinfo);
+ emit_byte(0x00, cinfo);
+ } while (--e->sc);
+ }
+ e->buffer = temp & 0xFF; /* new output byte (can still overflow) */
+ }
+ e->c &= 0x7FFFFL; /* keep only the 19 bits below the byte just taken */
+ e->ct += 8; /* eight more shifts until the next byte is complete */
+ }
+ } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Close the current restart interval: flush the coder, write the RSTn
+ * marker, and put the statistics areas, DC predictions and coder
+ * registers back into their start-of-interval state.
+ */
+
+LOCAL(void)
+emit_restart(j_compress_ptr cinfo, int restart_num)
+{
+  arith_entropy_ptr coder = (arith_entropy_ptr)cinfo->entropy;
+  int reset_dc, reset_ac;
+  int ci;
+
+  /* Flush everything still pending, then write the marker itself. */
+  finish_pass(cinfo);
+  emit_byte(0xFF, cinfo);
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* DC state is needed by sequential scans and by the first DC scan of a
+   * progressive image (not by refinement scans); AC state is needed by
+   * sequential scans and by progressive scans that carry AC coefficients.
+   * Neither test depends on the component, so evaluate both once.
+   */
+  reset_dc = (cinfo->progressive_mode == 0 ||
+              (cinfo->Ss == 0 && cinfo->Ah == 0));
+  reset_ac = (cinfo->progressive_mode == 0 || cinfo->Se);
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    jpeg_component_info *comp = cinfo->cur_comp_info[ci];
+
+    if (reset_dc) {
+      memset(coder->dc_stats[comp->dc_tbl_no], 0, DC_STAT_BINS);
+      /* DC prediction and conditioning context start over as well. */
+      coder->last_dc_val[ci] = 0;
+      coder->dc_context[ci] = 0;
+    }
+    if (reset_ac)
+      memset(coder->ac_stats[comp->ac_tbl_no], 0, AC_STAT_BINS);
+  }
+
+  /* Begin the next interval with freshly initialized coder registers. */
+  coder->buffer = -1;           /* nothing buffered yet */
+  coder->ct = 11;
+  coder->zc = 0;
+  coder->sc = 0;
+  coder->a = 0x10000L;
+  coder->c = 0;
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ *
+ * For every block of the MCU, the DC coefficient is point-transformed by
+ * cinfo->Al and its difference from the previous DC value of the same
+ * component is coded through arith_encode.  Always returns TRUE, since
+ * suspension is not supported in this module.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ unsigned char *st;
+ int blkn, ci, tbl;
+ int v, v2, m;
+ ISHIFT_TEMPS
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ emit_restart(cinfo, entropy->next_restart_num);
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7; /* restart numbers cycle through 0-7 */
+ }
+ entropy->restarts_to_go--;
+ }
+
+ /* Encode the MCU data blocks */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn]; /* component this block belongs to */
+ tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+ /* Compute the DC value after the required point transform by Al.
+ * This is simply an arithmetic right shift.
+ */
+ m = IRIGHT_SHIFT((int)((*block)[0]), cinfo->Al);
+
+ /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+ /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+ st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+ /* Figure F.4: Encode_DC_DIFF */
+ if ((v = m - entropy->last_dc_val[ci]) == 0) {
+ arith_encode(cinfo, st, 0); /* decision 0: the difference is zero */
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ } else {
+ entropy->last_dc_val[ci] = m;
+ arith_encode(cinfo, st, 1); /* decision 1: the difference is nonzero */
+ /* Figure F.6: Encoding nonzero value v */
+ /* Figure F.7: Encoding the sign of v */
+ if (v > 0) {
+ arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */
+ st += 2; /* Table F.4: SP = S0 + 2 */
+ entropy->dc_context[ci] = 4; /* small positive diff category */
+ } else {
+ v = -v; /* continue with the magnitude only */
+ arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */
+ st += 3; /* Table F.4: SN = S0 + 3 */
+ entropy->dc_context[ci] = 8; /* small negative diff category */
+ }
+ /* Figure F.8: Encoding the magnitude category of v */
+ m = 0;
+ if (v -= 1) { /* from here on v holds the magnitude minus 1 */
+ arith_encode(cinfo, st, 1);
+ m = 1;
+ v2 = v;
+ st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+ while (v2 >>= 1) { /* one decision per doubling; m ends as the top bit of v */
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st += 1;
+ }
+ }
+ arith_encode(cinfo, st, 0); /* decision 0 ends the magnitude category */
+ /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ entropy->dc_context[ci] += 8; /* large diff category */
+ /* Figure F.9: Encoding the magnitude bit pattern of v */
+ st += 14;
+ while (m >>= 1) /* bits of v below its top bit, most significant first */
+ arith_encode(cinfo, st, (m & v) ? 1 : 0);
+ }
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ *
+ * Only MCU_data[0] is coded, with the AC table of cur_comp_info[0].  The
+ * coefficients in the band Ss..Se are visited in jpeg_natural_order and
+ * point-transformed by cinfo->Al.  Always returns TRUE, since suspension
+ * is not supported in this module.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ unsigned char *st;
+ int tbl, k, ke;
+ int v, v2, m;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ emit_restart(cinfo, entropy->next_restart_num);
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7; /* restart numbers cycle through 0-7 */
+ }
+ entropy->restarts_to_go--;
+ }
+
+ /* Encode the MCU data block */
+ block = MCU_data[0];
+ tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+ /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+ /* Establish EOB (end-of-block) index: scan down from Se to the last
+ * coefficient that is still nonzero after the point transform; ke ends
+ * as 0 when there is none */
+ for (ke = cinfo->Se; ke > 0; ke--)
+ /* We must apply the point transform by Al. For AC coefficients this
+ * is an integer division with rounding towards 0. To do this portably
+ * in C, we shift after obtaining the absolute value.
+ */
+ if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+ if (v >>= cinfo->Al) break;
+ } else {
+ v = -v;
+ if (v >>= cinfo->Al) break;
+ }
+
+ /* Figure F.5: Encode_AC_Coefficients */
+ for (k = cinfo->Ss; k <= ke; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per coefficient index */
+ arith_encode(cinfo, st, 0); /* EOB decision */
+ for (;;) { /* ends at or before ke, whose coefficient is nonzero */
+ if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+ if (v >>= cinfo->Al) {
+ arith_encode(cinfo, st + 1, 1); /* nonzero coefficient found */
+ arith_encode(cinfo, entropy->fixed_bin, 0); /* sign: positive */
+ break;
+ }
+ } else {
+ v = -v;
+ if (v >>= cinfo->Al) {
+ arith_encode(cinfo, st + 1, 1); /* nonzero coefficient found */
+ arith_encode(cinfo, entropy->fixed_bin, 1); /* sign: negative */
+ break;
+ }
+ }
+ arith_encode(cinfo, st + 1, 0); st += 3; k++; /* zero coefficient: move on */
+ }
+ st += 2; /* third bin of the triple for index k */
+ /* Figure F.8: Encoding the magnitude category of v */
+ m = 0;
+ if (v -= 1) { /* from here on v holds the magnitude minus 1 */
+ arith_encode(cinfo, st, 1);
+ m = 1;
+ v2 = v;
+ if (v2 >>= 1) {
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st = entropy->ac_stats[tbl] + /* bin group chosen by conditioning value K */
+ (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+ while (v2 >>= 1) { /* one decision per further doubling */
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st += 1;
+ }
+ }
+ }
+ arith_encode(cinfo, st, 0); /* decision 0 ends the magnitude category */
+ /* Figure F.9: Encoding the magnitude bit pattern of v */
+ st += 14;
+ while (m >>= 1) /* bits of v below its top bit, most significant first */
+ arith_encode(cinfo, st, (m & v) ? 1 : 0);
+ }
+ /* Encode EOB decision only if k <= cinfo->Se */
+ if (k <= cinfo->Se) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1);
+ arith_encode(cinfo, st, 1); /* decision 1: end of block */
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for a DC refinement scan (successive approximation).
+ * For each block of the MCU, bit number cinfo->Al of the DC coefficient is
+ * coded with the fixed probability estimate.  Always returns TRUE.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+  int bit_pos, blk, dc_bit;
+
+  /* Deal with an expired restart interval before coding anything */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      /* restart marker numbers cycle through 0..7 */
+      entropy->next_restart_num = (entropy->next_restart_num + 1) & 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  bit_pos = cinfo->Al;
+
+  /* One decision per block: the Al'th bit of its DC coefficient */
+  for (blk = 0; blk < cinfo->blocks_in_MCU; blk++) {
+    dc_bit = (MCU_data[blk][0][0] >> bit_pos) & 1;
+    arith_encode(cinfo, entropy->fixed_bin, dc_bit);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ *
+ * Only MCU_data[0] is coded, using the AC table of cur_comp_info[0].
+ * Two end-of-block indexes are computed: ke for the current point transform
+ * Al, and kex for the previous stage's Ah.  Inside the coefficient loop no
+ * EOB decision is coded at positions k <= kex.  A coefficient whose magnitude
+ * shifted by Al is greater than 1 was already nonzero and gets a single
+ * correction bit; a shifted magnitude of exactly 1 is newly nonzero and gets
+ * a "nonzero" decision followed by its sign.
+ * Always returns TRUE.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ unsigned char *st; /* statistics bin used for the next decision */
+ int tbl, k, ke, kex; /* AC table number, position, EOB and EOBx indexes */
+ int v; /* coefficient magnitude after the point transform */
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ emit_restart(cinfo, entropy->next_restart_num);
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7; /* marker number wraps modulo 8 */
+ }
+ entropy->restarts_to_go--;
+ }
+
+ /* Encode the MCU data block */
+ block = MCU_data[0]; /* only the first block of the MCU is read */
+ tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+ /* Section G.1.3.3: Encoding of AC coefficients */
+
+ /* Establish EOB (end-of-block) index */
+ for (ke = cinfo->Se; ke > 0; ke--) /* scan backwards from Se */
+ /* We must apply the point transform by Al. For AC coefficients this
+ * is an integer division with rounding towards 0. To do this portably
+ * in C, we shift after obtaining the absolute value.
+ */
+ if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+ if (v >>= cinfo->Al) break;
+ } else {
+ v = -v;
+ if (v >>= cinfo->Al) break;
+ }
+
+ /* Establish EOBx (previous stage end-of-block) index */
+ for (kex = ke; kex > 0; kex--) /* same scan, but shifting by Ah */
+ if ((v = (*block)[jpeg_natural_order[kex]]) >= 0) {
+ if (v >>= cinfo->Ah) break;
+ } else {
+ v = -v;
+ if (v >>= cinfo->Ah) break;
+ }
+
+ /* Figure G.10: Encode_AC_Coefficients_SA */
+ for (k = cinfo->Ss; k <= ke; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per position k */
+ if (k > kex)
+ arith_encode(cinfo, st, 0); /* EOB decision */
+ for (;;) { /* skip over coefficients that are zero after the shift */
+ if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+ if (v >>= cinfo->Al) {
+ if (v >> 1) /* previously nonzero coef */
+ arith_encode(cinfo, st + 2, (v & 1)); /* correction bit */
+ else { /* newly nonzero coef */
+ arith_encode(cinfo, st + 1, 1);
+ arith_encode(cinfo, entropy->fixed_bin, 0); /* sign: non-negative */
+ }
+ break;
+ }
+ } else {
+ v = -v;
+ if (v >>= cinfo->Al) {
+ if (v >> 1) /* previously nonzero coef */
+ arith_encode(cinfo, st + 2, (v & 1)); /* correction bit */
+ else { /* newly nonzero coef */
+ arith_encode(cinfo, st + 1, 1);
+ arith_encode(cinfo, entropy->fixed_bin, 1); /* sign: negative */
+ }
+ break;
+ }
+ }
+ arith_encode(cinfo, st + 1, 0); st += 3; k++; /* zero: move to next position */
+ }
+ }
+ /* Encode EOB decision only if k <= cinfo->Se */
+ if (k <= cinfo->Se) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1);
+ arith_encode(cinfo, st, 1);
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ *
+ * This is the routine start_pass() installs when cinfo->progressive_mode is
+ * zero.  For every block of the MCU it codes the DC difference against the
+ * component's last_dc_val, then the AC coefficients 1..DCTSIZE2-1 in
+ * jpeg_natural_order[] order up to the last nonzero one.  No point transform
+ * is applied here.
+ * A restart marker is emitted first whenever the restart interval expires.
+ * Always returns TRUE.
+ */
+
+METHODDEF(boolean)
+encode_mcu(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ jpeg_component_info *compptr;
+ JBLOCKROW block;
+ unsigned char *st; /* statistics bin used for the next decision */
+ int blkn, ci, tbl, k, ke; /* block #, component #, table #, position, EOB */
+ int v, v2, m; /* value/magnitude, scratch copy of it, bit mask */
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ emit_restart(cinfo, entropy->next_restart_num);
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7; /* marker number wraps modulo 8 */
+ }
+ entropy->restarts_to_go--;
+ }
+
+ /* Encode the MCU data blocks */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn]; /* scan component this block belongs to */
+ compptr = cinfo->cur_comp_info[ci];
+
+ /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+ tbl = compptr->dc_tbl_no;
+
+ /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+ st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+ /* Figure F.4: Encode_DC_DIFF */
+ if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
+ arith_encode(cinfo, st, 0);
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ } else {
+ entropy->last_dc_val[ci] = (*block)[0]; /* becomes the next prediction */
+ arith_encode(cinfo, st, 1);
+ /* Figure F.6: Encoding nonzero value v */
+ /* Figure F.7: Encoding the sign of v */
+ if (v > 0) {
+ arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */
+ st += 2; /* Table F.4: SP = S0 + 2 */
+ entropy->dc_context[ci] = 4; /* small positive diff category */
+ } else {
+ v = -v;
+ arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */
+ st += 3; /* Table F.4: SN = S0 + 3 */
+ entropy->dc_context[ci] = 8; /* small negative diff category */
+ }
+ /* Figure F.8: Encoding the magnitude category of v */
+ m = 0;
+ if (v -= 1) { /* true when the magnitude exceeds 1 */
+ arith_encode(cinfo, st, 1);
+ m = 1;
+ v2 = v;
+ st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+ while (v2 >>= 1) {
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st += 1;
+ }
+ }
+ arith_encode(cinfo, st, 0); /* terminates the magnitude category */
+ /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ entropy->dc_context[ci] += 8; /* large diff category */
+ /* Figure F.9: Encoding the magnitude bit pattern of v */
+ st += 14;
+ while (m >>= 1) /* bits of v below its leading one, high to low */
+ arith_encode(cinfo, st, (m & v) ? 1 : 0);
+ }
+
+ /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+ tbl = compptr->ac_tbl_no;
+
+ /* Establish EOB (end-of-block) index */
+ for (ke = DCTSIZE2 - 1; ke > 0; ke--) /* find the last nonzero coefficient */
+ if ((*block)[jpeg_natural_order[ke]]) break;
+
+ /* Figure F.5: Encode_AC_Coefficients */
+ for (k = 1; k <= ke; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per position k */
+ arith_encode(cinfo, st, 0); /* EOB decision */
+ while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
+ arith_encode(cinfo, st + 1, 0); st += 3; k++; /* zero: move to next position */
+ }
+ arith_encode(cinfo, st + 1, 1); /* nonzero coefficient */
+ /* Figure F.6: Encoding nonzero value v */
+ /* Figure F.7: Encoding the sign of v */
+ if (v > 0) {
+ arith_encode(cinfo, entropy->fixed_bin, 0);
+ } else {
+ v = -v;
+ arith_encode(cinfo, entropy->fixed_bin, 1);
+ }
+ st += 2; /* third bin of position k starts the magnitude decisions */
+ /* Figure F.8: Encoding the magnitude category of v */
+ m = 0;
+ if (v -= 1) { /* true when the magnitude exceeds 1 */
+ arith_encode(cinfo, st, 1);
+ m = 1;
+ v2 = v;
+ if (v2 >>= 1) {
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st = entropy->ac_stats[tbl] +
+ (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); /* bin base chosen by k vs. K */
+ while (v2 >>= 1) {
+ arith_encode(cinfo, st, 1);
+ m <<= 1;
+ st += 1;
+ }
+ }
+ }
+ arith_encode(cinfo, st, 0); /* terminates the magnitude category */
+ /* Figure F.9: Encoding the magnitude bit pattern of v */
+ st += 14;
+ while (m >>= 1) /* bits of v below its leading one, high to low */
+ arith_encode(cinfo, st, (m & v) ? 1 : 0);
+ }
+ /* Encode EOB decision only if k <= DCTSIZE2 - 1 */
+ if (k <= DCTSIZE2 - 1) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1);
+ arith_encode(cinfo, st, 1);
+ }
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * Prepare the arithmetic encoder for a new scan: choose the MCU encoding
+ * routine, allocate (on first use) and clear the statistics areas needed by
+ * the scan's components, and reset the coder registers and restart state.
+ */
+
+METHODDEF(void)
+start_pass(j_compress_ptr cinfo, boolean gather_statistics)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+  jpeg_component_info *comp;
+  int idx, tno;
+  int need_dc, need_ac;
+
+  /* The coder is fully adaptive, so a statistics-gathering pass is never
+   * needed; the master control logic must not ask for one.
+   */
+  if (gather_statistics)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
+  /* The progressive scan parameters are assumed to have been validated
+   * already by jcmaster.c.
+   */
+
+  /* Pick the MCU encoder that matches the scan type */
+  if (!cinfo->progressive_mode)
+    entropy->pub.encode_mcu = encode_mcu;
+  else if (cinfo->Ah == 0)
+    entropy->pub.encode_mcu =
+      (cinfo->Ss == 0) ? encode_mcu_DC_first : encode_mcu_AC_first;
+  else
+    entropy->pub.encode_mcu =
+      (cinfo->Ss == 0) ? encode_mcu_DC_refine : encode_mcu_AC_refine;
+
+  /* A DC refinement scan uses no DC table; a scan with Se == 0 has no AC */
+  need_dc = (cinfo->progressive_mode == 0 ||
+             (cinfo->Ss == 0 && cinfo->Ah == 0));
+  need_ac = (cinfo->progressive_mode == 0 || cinfo->Se);
+
+  /* Set up the statistics areas of every component in the scan */
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    comp = cinfo->cur_comp_info[idx];
+
+    if (need_dc) {
+      tno = comp->dc_tbl_no;
+      if (tno < 0 || tno >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tno);
+      if (entropy->dc_stats[tno] == NULL)
+        entropy->dc_stats[tno] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      memset(entropy->dc_stats[tno], 0, DC_STAT_BINS);
+      /* DC prediction and conditioning context start from zero */
+      entropy->last_dc_val[idx] = 0;
+      entropy->dc_context[idx] = 0;
+    }
+
+    if (need_ac) {
+      tno = comp->ac_tbl_no;
+      if (tno < 0 || tno >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tno);
+      if (entropy->ac_stats[tno] == NULL)
+        entropy->ac_stats[tno] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      memset(entropy->ac_stats[tno], 0, AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+      if (cinfo->progressive_mode)
+        cinfo->arith_ac_K[tno] = cinfo->Ss +
+                                 ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Reset the arithmetic coder registers */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;         /* empty */
+
+  /* Reset the restart bookkeeping */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Create the arithmetic entropy encoder object and attach it to cinfo.
+ * Statistics areas are only marked as missing here; start_pass() allocates
+ * them when a scan actually needs them.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder(j_compress_ptr cinfo)
+{
+  arith_entropy_ptr enc;
+  int t;
+
+  enc = (arith_entropy_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(arith_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *)enc;
+
+  /* Hook up the public methods */
+  enc->pub.start_pass = start_pass;
+  enc->pub.finish_pass = finish_pass;
+
+  /* No DC or AC statistics area has been allocated yet */
+  t = 0;
+  while (t < NUM_ARITH_TBLS) {
+    enc->dc_stats[t] = NULL;
+    enc->ac_stats[t] = NULL;
+    t++;
+  }
+
+  /* State index used by the fixed probability estimation bin */
+  enc->fixed_bin[0] = 113;
+}
diff --git a/media/libjpeg/jccoefct.c b/media/libjpeg/jccoefct.c
new file mode 100644
index 0000000000..068232a527
--- /dev/null
+++ b/media/libjpeg/jccoefct.c
@@ -0,0 +1,449 @@
+/*
+ * jccoefct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the coefficient buffer controller for compression.
+ * This controller is the top level of the JPEG compressor proper.
+ * The coefficient buffer lies between forward-DCT and entropy encoding steps.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* A full-image coefficient buffer is needed both for Huffman optimization
+ * and for writing multiple-scan JPEG files.  In either case the DCT step is
+ * run during the first pass only; later passes just read back the buffered
+ * coefficients.
+ */
+#if defined(ENTROPY_OPT_SUPPORTED) || defined(C_MULTISCAN_FILES_SUPPORTED)
+#define FULL_COEF_BUFFER_SUPPORTED
+#endif
+
+
+/* Private buffer controller object.
+ * The public struct is the first member, and the functions in this file cast
+ * cinfo->coef to my_coef_ptr to reach the private fields below.
+ */
+
+typedef struct {
+ struct jpeg_c_coef_controller pub; /* public fields */
+
+ JDIMENSION iMCU_row_num; /* iMCU row # within image */
+ JDIMENSION mcu_ctr; /* counts MCUs processed in current row */
+ int MCU_vert_offset; /* counts MCU rows within iMCU row */
+ int MCU_rows_per_iMCU_row; /* number of such rows needed */
+
+ /* For single-pass compression, it's sufficient to buffer just one MCU
+ * (although this may prove a bit slow in practice). We allocate a
+ * workspace of C_MAX_BLOCKS_IN_MCU coefficient blocks, and reuse it for each
+ * MCU constructed and sent. In multi-pass modes, this array points to the
+ * current MCU's blocks within the virtual arrays.
+ */
+ JBLOCKROW MCU_buffer[C_MAX_BLOCKS_IN_MCU];
+
+ /* In multi-pass modes, we need a virtual block array for each component.
+ * whole_image[0] == NULL is the flag for "no virtual arrays" (single-pass
+ * mode); start_pass_coef() checks it against the requested buffer mode.
+ */
+ jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+} my_coef_controller;
+
+typedef my_coef_controller *my_coef_ptr; /* what cinfo->coef is cast to */
+
+
+/* Forward declarations.
+ * start_pass_coef() installs one of these routines as
+ * coef->pub.compress_data, and compress_first_pass() calls compress_output(),
+ * so all three are declared ahead of their definitions.  The two full-buffer
+ * routines exist only when FULL_COEF_BUFFER_SUPPORTED is defined.
+ */
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+#ifdef FULL_COEF_BUFFER_SUPPORTED
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+ JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+#endif
+
+
+LOCAL(void)
+start_iMCU_row(j_compress_ptr cinfo)
+/* Begin a new iMCU row: work out how many MCU rows it holds and zero the
+ * within-row progress counters.
+ */
+{
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  int mcu_rows;
+
+  if (cinfo->comps_in_scan > 1) {
+    /* Interleaved scan: an MCU row and an iMCU row are the same thing */
+    mcu_rows = 1;
+  } else if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1)) {
+    /* Noninterleaved scan: v_samp_factor MCU rows per iMCU row ... */
+    mcu_rows = cinfo->cur_comp_info[0]->v_samp_factor;
+  } else {
+    /* ... except in the last iMCU row, where only what's left is processed */
+    mcu_rows = cinfo->cur_comp_info[0]->last_row_height;
+  }
+
+  coef->MCU_rows_per_iMCU_row = mcu_rows;
+  coef->mcu_ctr = 0;
+  coef->MCU_vert_offset = 0;
+}
+
+
+/*
+ * Initialize for a processing pass: rewind to the first iMCU row and install
+ * the compress_data method that matches the requested buffer mode.
+ * A mode that disagrees with the presence or absence of the full-image
+ * buffer, or an unsupported mode, raises JERR_BAD_BUFFER_MODE.
+ */
+
+METHODDEF(void)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  int have_full_buffer;
+
+  coef->iMCU_row_num = 0;
+  start_iMCU_row(cinfo);
+
+  /* whole_image[0] is NULL when only the single-MCU workspace exists */
+  have_full_buffer = (coef->whole_image[0] != NULL);
+
+  if (pass_mode == JBUF_PASS_THRU) {
+    if (have_full_buffer)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    coef->pub.compress_data = compress_data;
+    return;
+  }
+
+#ifdef FULL_COEF_BUFFER_SUPPORTED
+  if (pass_mode == JBUF_SAVE_AND_PASS || pass_mode == JBUF_CRANK_DEST) {
+    if (!have_full_buffer)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    if (pass_mode == JBUF_SAVE_AND_PASS)
+      coef->pub.compress_data = compress_first_pass;
+    else
+      coef->pub.compress_data = compress_output;
+    return;
+  }
+#endif
+
+  /* Any other buffer mode is not supported by this controller */
+  ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+}
+
+
+/*
+ * Process some data in the single-pass case.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor block rows for each component in the image.
+ * Returns TRUE if the iMCU row is completed, FALSE if suspended.
+ *
+ * NB: input_buf contains a plane for each component in image,
+ * which we index according to the component's SOF position.
+ *
+ * On suspension (encode_mcu returning FALSE) the current MCU row and column
+ * are saved in coef->MCU_vert_offset and coef->mcu_ctr, and the loops below
+ * start from those values on the next call.  On completion iMCU_row_num is
+ * advanced and the within-row counters are reset via start_iMCU_row().
+ */
+
+METHODDEF(boolean)
+compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION MCU_col_num; /* index of current MCU within row */
+ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; /* index of rightmost MCU */
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; /* index of bottom row */
+ int blkn, bi, ci, yindex, yoffset, blockcnt; /* blockcnt: real blocks across */
+ JDIMENSION ypos, xpos; /* sample position passed to forward_DCT */
+ jpeg_component_info *compptr;
+
+ /* Loop to write as much as one whole iMCU row */
+ for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
+ yoffset++) {
+ for (MCU_col_num = coef->mcu_ctr; MCU_col_num <= last_MCU_col;
+ MCU_col_num++) {
+ /* Determine where data comes from in input_buf and do the DCT thing.
+ * Each call on forward_DCT processes a horizontal row of DCT blocks
+ * as wide as an MCU; we rely on having allocated the MCU_buffer[] blocks
+ * sequentially. Dummy blocks at the right or bottom edge are filled in
+ * specially. The data in them does not matter for image reconstruction,
+ * so we fill them with values that will encode to the smallest amount of
+ * data, viz: all zeroes in the AC entries, DC entries equal to previous
+ * block's DC value. (Thanks to Thomas Kinsman for this idea.)
+ */
+ blkn = 0; /* index of the next MCU_buffer[] slot to fill */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width; /* fewer real blocks in last column */
+ xpos = MCU_col_num * compptr->MCU_sample_width;
+ ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+ for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+ if (coef->iMCU_row_num < last_iMCU_row ||
+ yoffset + yindex < compptr->last_row_height) {
+ (*cinfo->fdct->forward_DCT) (cinfo, compptr,
+ input_buf[compptr->component_index],
+ coef->MCU_buffer[blkn],
+ ypos, xpos, (JDIMENSION)blockcnt);
+ if (blockcnt < compptr->MCU_width) {
+ /* Create some dummy blocks at the right edge of the image. */
+ jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
+ (compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
+ for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn + bi - 1][0][0]; /* copy DC from the left */
+ }
+ }
+ } else {
+ /* Create a row of dummy blocks at the bottom of the image. */
+ jzero_far((void *)coef->MCU_buffer[blkn],
+ compptr->MCU_width * sizeof(JBLOCK));
+ for (bi = 0; bi < compptr->MCU_width; bi++) {
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn - 1][0][0]; /* DC of block just before this row */
+ }
+ }
+ blkn += compptr->MCU_width;
+ ypos += DCTSIZE;
+ }
+ }
+ /* Try to write the MCU. In event of a suspension failure, we will
+ * re-DCT the MCU on restart (a bit inefficient, could be fixed...)
+ */
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+ /* Suspension forced; update state counters and exit */
+ coef->MCU_vert_offset = yoffset;
+ coef->mcu_ctr = MCU_col_num;
+ return FALSE;
+ }
+ }
+ /* Completed an MCU row, but perhaps not an iMCU row */
+ coef->mcu_ctr = 0;
+ }
+ /* Completed the iMCU row, advance counters for next one */
+ coef->iMCU_row_num++;
+ start_iMCU_row(cinfo);
+ return TRUE;
+}
+
+
+#ifdef FULL_COEF_BUFFER_SUPPORTED
+
+/*
+ * Process some data in the first pass of a multi-pass case.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor block rows for each component in the image.
+ * This amount of data is read from the source buffer, DCT'd and quantized,
+ * and saved into the virtual arrays. We also generate suitable dummy blocks
+ * as needed at the right and lower edges. (The dummy blocks are constructed
+ * in the virtual arrays, which have been padded appropriately.) This makes
+ * it possible for subsequent passes not to worry about real vs. dummy blocks.
+ *
+ * We must also emit the data to the entropy encoder. This is conveniently
+ * done by calling compress_output() after we've loaded the current strip
+ * of the virtual arrays.
+ *
+ * NB: input_buf contains a plane for each component in image. All
+ * components are DCT'd and loaded into the virtual arrays in this pass.
+ * However, it may be that only a subset of the components are emitted to
+ * the entropy encoder during this first pass; be careful about looking
+ * at the scan-dependent variables (MCU dimensions, etc).
+ *
+ * The return value is whatever compress_output() returns.
+ */
+
+METHODDEF(boolean)
+compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; /* index of bottom row */
+ JDIMENSION blocks_across, MCUs_across, MCUindex;
+ int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
+ JCOEF lastDC; /* DC value replicated into dummy blocks */
+ jpeg_component_info *compptr;
+ JBLOCKARRAY buffer; /* this component's block rows for the current iMCU row */
+ JBLOCKROW thisblockrow, lastblockrow;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Align the virtual buffer for this component. */
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ coef->iMCU_row_num * compptr->v_samp_factor,
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
+ /* Count non-dummy DCT block rows in this iMCU row. */
+ if (coef->iMCU_row_num < last_iMCU_row)
+ block_rows = compptr->v_samp_factor;
+ else {
+ /* NB: can't use last_row_height here, since may not be set! */
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+ if (block_rows == 0) block_rows = compptr->v_samp_factor; /* exact multiple */
+ }
+ blocks_across = compptr->width_in_blocks;
+ h_samp_factor = compptr->h_samp_factor;
+ /* Count number of dummy blocks to be added at the right margin. */
+ ndummy = (int)(blocks_across % h_samp_factor);
+ if (ndummy > 0)
+ ndummy = h_samp_factor - ndummy; /* pad up to a multiple of h_samp_factor */
+ /* Perform DCT for all non-dummy blocks in this iMCU row. Each call
+ * on forward_DCT processes a complete horizontal row of DCT blocks.
+ */
+ for (block_row = 0; block_row < block_rows; block_row++) {
+ thisblockrow = buffer[block_row];
+ (*cinfo->fdct->forward_DCT) (cinfo, compptr,
+ input_buf[ci], thisblockrow,
+ (JDIMENSION)(block_row * DCTSIZE),
+ (JDIMENSION)0, blocks_across);
+ if (ndummy > 0) {
+ /* Create dummy blocks at the right edge of the image. */
+ thisblockrow += blocks_across; /* => first dummy block */
+ jzero_far((void *)thisblockrow, ndummy * sizeof(JBLOCK));
+ lastDC = thisblockrow[-1][0]; /* DC of the last real block in the row */
+ for (bi = 0; bi < ndummy; bi++) {
+ thisblockrow[bi][0] = lastDC;
+ }
+ }
+ }
+ /* If at end of image, create dummy block rows as needed.
+ * The tricky part here is that within each MCU, we want the DC values
+ * of the dummy blocks to match the last real block's DC value.
+ * This squeezes a few more bytes out of the resulting file...
+ */
+ if (coef->iMCU_row_num == last_iMCU_row) {
+ blocks_across += ndummy; /* include lower right corner */
+ MCUs_across = blocks_across / h_samp_factor;
+ for (block_row = block_rows; block_row < compptr->v_samp_factor;
+ block_row++) {
+ thisblockrow = buffer[block_row];
+ lastblockrow = buffer[block_row - 1]; /* the block row just above */
+ jzero_far((void *)thisblockrow,
+ (size_t)(blocks_across * sizeof(JBLOCK)));
+ for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
+ lastDC = lastblockrow[h_samp_factor - 1][0]; /* last block of MCU above */
+ for (bi = 0; bi < h_samp_factor; bi++) {
+ thisblockrow[bi][0] = lastDC;
+ }
+ thisblockrow += h_samp_factor; /* advance to next MCU in row */
+ lastblockrow += h_samp_factor;
+ }
+ }
+ }
+ }
+ /* NB: compress_output will increment iMCU_row_num if successful.
+ * A suspension return will result in redoing all the work above next time.
+ */
+
+ /* Emit data to the entropy encoder, sharing code with subsequent passes */
+ return compress_output(cinfo, input_buf);
+}
+
+
+/*
+ * Feed one iMCU row of already-buffered coefficients to the entropy coder.
+ * This serves the later passes of a multi-pass compression (and is also
+ * called by compress_first_pass once the virtual arrays are loaded).
+ * For each MCU, MCU_buffer[] is pointed at the MCU's blocks inside the
+ * virtual arrays and handed to encode_mcu.
+ * Returns TRUE if the iMCU row is completed, FALSE if suspended.
+ *
+ * NB: input_buf is ignored; it is likely to be a NULL pointer.
+ */
+
+METHODDEF(boolean)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+{
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  JBLOCKARRAY comp_rows[MAX_COMPS_IN_SCAN];
+  jpeg_component_info *comp;
+  JBLOCKROW src;
+  JDIMENSION mcu_col;           /* index of current MCU within row */
+  JDIMENSION first_blk;         /* leftmost block column of the MCU */
+  int c, row, col, mcu_row;
+  int nblk;                     /* next free slot in MCU_buffer[] */
+
+  /* Align the virtual buffers for the components used in this scan.
+   * NB: during first pass, this is safe only because the buffers will
+   * already be aligned properly, so jmemmgr.c won't need to do any I/O.
+   */
+  for (c = 0; c < cinfo->comps_in_scan; c++) {
+    comp = cinfo->cur_comp_info[c];
+    comp_rows[c] = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef->whole_image[comp->component_index],
+       coef->iMCU_row_num * comp->v_samp_factor,
+       (JDIMENSION)comp->v_samp_factor, FALSE);
+  }
+
+  /* Walk the MCU rows of this iMCU row, resuming where we last stopped */
+  mcu_row = coef->MCU_vert_offset;
+  while (mcu_row < coef->MCU_rows_per_iMCU_row) {
+    mcu_col = coef->mcu_ctr;
+    while (mcu_col < cinfo->MCUs_per_row) {
+      /* Collect pointers to the DCT blocks that make up this MCU */
+      nblk = 0;
+      for (c = 0; c < cinfo->comps_in_scan; c++) {
+        comp = cinfo->cur_comp_info[c];
+        first_blk = mcu_col * comp->MCU_width;
+        for (row = 0; row < comp->MCU_height; row++) {
+          src = comp_rows[c][row + mcu_row] + first_blk;
+          for (col = 0; col < comp->MCU_width; col++)
+            coef->MCU_buffer[nblk++] = src + col;
+        }
+      }
+      /* Hand the MCU to the entropy coder */
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+        /* Suspended: remember the position so the next call resumes here */
+        coef->MCU_vert_offset = mcu_row;
+        coef->mcu_ctr = mcu_col;
+        return FALSE;
+      }
+      mcu_col++;
+    }
+    /* An MCU row is done, though maybe not the whole iMCU row */
+    coef->mcu_ctr = 0;
+    mcu_row++;
+  }
+
+  /* The iMCU row is done; move on to the next one */
+  coef->iMCU_row_num++;
+  start_iMCU_row(cinfo);
+  return TRUE;
+}
+
+#endif /* FULL_COEF_BUFFER_SUPPORTED */
+
+
+/*
+ * Create the coefficient buffer controller and attach it to cinfo.
+ * With need_full_buffer set, one full-image virtual block array is requested
+ * per component; otherwise a single-MCU workspace is allocated and
+ * whole_image[0] is set to NULL to mark the absence of virtual arrays.
+ */
+
+GLOBAL(void)
+jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+{
+  my_coef_ptr coef = (my_coef_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(my_coef_controller));
+
+  cinfo->coef = (struct jpeg_c_coef_controller *)coef;
+  coef->pub.start_pass = start_pass_coef;
+
+  if (!need_full_buffer) {
+    /* Single-pass case: one contiguous run of blocks is enough, with each
+     * MCU_buffer[] entry pointing at its own block in that run.
+     */
+    JBLOCKROW mcu_blocks = (JBLOCKROW)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+    int slot;
+
+    for (slot = 0; slot < C_MAX_BLOCKS_IN_MCU; slot++)
+      coef->MCU_buffer[slot] = &mcu_blocks[slot];
+    coef->whole_image[0] = NULL;        /* flag for no virtual arrays */
+    return;
+  }
+
+#ifdef FULL_COEF_BUFFER_SUPPORTED
+  {
+    /* Full-buffer case: a virtual array per component, padded to a multiple
+     * of the sampling factor in DCT blocks in each direction.
+     */
+    jpeg_component_info *comp = cinfo->comp_info;
+    JDIMENSION padded_width, padded_height;
+    int c;
+
+    for (c = 0; c < cinfo->num_components; c++, comp++) {
+      padded_width = (JDIMENSION)jround_up((long)comp->width_in_blocks,
+                                           (long)comp->h_samp_factor);
+      padded_height = (JDIMENSION)jround_up((long)comp->height_in_blocks,
+                                            (long)comp->v_samp_factor);
+      coef->whole_image[c] = (*cinfo->mem->request_virt_barray)
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+         padded_width, padded_height, (JDIMENSION)comp->v_samp_factor);
+    }
+  }
+#else
+  ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+#endif
+}
diff --git a/media/libjpeg/jccolext.c b/media/libjpeg/jccolext.c
new file mode 100644
index 0000000000..20f891a633
--- /dev/null
+++ b/media/libjpeg/jccolext.c
@@ -0,0 +1,144 @@
+/*
+ * jccolext.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2012, 2015, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains input colorspace conversion routines.
+ */
+
+
+/* This file is included by jccolor.c */
+
+
+/*
+ * Convert num_rows rows of interleaved RGB input into three separate output
+ * planes (Y, Cb, Cr) using the lookup table built for RGB->YCbCr conversion.
+ *
+ * The input is pixel-interleaved while the output holds one plane per
+ * component, so an input row is wider than an output row.
+ *
+ * Only the output side takes a starting row (output_row); a caller needing
+ * an input offset can simply adjust input_buf before the call.
+ */
+
+INLINE
+LOCAL(void)
+rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  JLONG *tab = cconvert->rgb_ycc_tab;
+  JDIMENSION width = cinfo->image_width;
+  JDIMENSION x;
+  JSAMPROW src, y_row, cb_row, cr_row;
+  int red, green, blue;
+
+  for (; num_rows > 0; num_rows--, input_buf++, output_row++) {
+    src = *input_buf;
+    y_row = output_buf[0][output_row];
+    cb_row = output_buf[1][output_row];
+    cr_row = output_buf[2][output_row];
+
+    for (x = 0; x < width; x++, src += RGB_PIXELSIZE) {
+      red = RANGE_LIMIT(src[RGB_RED]);
+      green = RANGE_LIMIT(src[RGB_GREEN]);
+      blue = RANGE_LIMIT(src[RGB_BLUE]);
+      /* With inputs in 0..MAXJSAMPLE the results of these equations are in
+       * that range too, so no explicit range limiting is needed.  The sums
+       * are therefore never negative and a plain >> is sufficient (no need
+       * for the general RIGHT_SHIFT macro).
+       */
+      y_row[x] = (JSAMPLE)((tab[red + R_Y_OFF] + tab[green + G_Y_OFF] +
+                            tab[blue + B_Y_OFF]) >> SCALEBITS);
+      cb_row[x] = (JSAMPLE)((tab[red + R_CB_OFF] + tab[green + G_CB_OFF] +
+                             tab[blue + B_CB_OFF]) >> SCALEBITS);
+      cr_row[x] = (JSAMPLE)((tab[red + R_CR_OFF] + tab[green + G_CR_OFF] +
+                             tab[blue + B_CR_OFF]) >> SCALEBITS);
+    }
+  }
+}
+
+
+/**************** Cases other than RGB -> YCbCr **************/
+
+
+/*
+ * Convert num_rows rows of interleaved RGB input into a single grayscale
+ * output plane.  The computation is the Y part of RGB->YCbCr, so only the
+ * Y entries of the conversion table are used; rgb_ycc_start is assumed to
+ * have been called to build that table.
+ */
+
+INLINE
+LOCAL(void)
+rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                          JSAMPIMAGE output_buf, JDIMENSION output_row,
+                          int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  JLONG *tab = cconvert->rgb_ycc_tab;
+  JDIMENSION width = cinfo->image_width;
+  JDIMENSION x;
+  JSAMPROW src, gray_row;
+  int red, green, blue;
+
+  for (; num_rows > 0; num_rows--, input_buf++, output_row++) {
+    src = *input_buf;
+    gray_row = output_buf[0][output_row];
+
+    for (x = 0; x < width; x++, src += RGB_PIXELSIZE) {
+      red = RANGE_LIMIT(src[RGB_RED]);
+      green = RANGE_LIMIT(src[RGB_GREEN]);
+      blue = RANGE_LIMIT(src[RGB_BLUE]);
+      /* Luminance only */
+      gray_row[x] = (JSAMPLE)((tab[red + R_Y_OFF] + tab[green + G_Y_OFF] +
+                               tab[blue + B_Y_OFF]) >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * Convert num_rows rows of extended-RGB input into plain RGB: the red, green
+ * and blue samples of each interleaved input pixel are copied unchanged into
+ * output planes 0, 1 and 2 respectively.
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
+{
+  JDIMENSION width = cinfo->image_width;
+  JDIMENSION x;
+  JSAMPROW src, r_row, g_row, b_row;
+
+  for (; num_rows > 0; num_rows--, input_buf++, output_row++) {
+    src = *input_buf;
+    r_row = output_buf[0][output_row];
+    g_row = output_buf[1][output_row];
+    b_row = output_buf[2][output_row];
+
+    for (x = 0; x < width; x++, src += RGB_PIXELSIZE) {
+      r_row[x] = src[RGB_RED];
+      g_row[x] = src[RGB_GREEN];
+      b_row[x] = src[RGB_BLUE];
+    }
+  }
+}
diff --git a/media/libjpeg/jccolor.c b/media/libjpeg/jccolor.c
new file mode 100644
index 0000000000..fb9f1cc192
--- /dev/null
+++ b/media/libjpeg/jccolor.c
@@ -0,0 +1,721 @@
+/*
+ * jccolor.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2012, 2015, 2022, D. R. Commander.
+ * Copyright (C) 2014, MIPS Technologies, Inc., California.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains input colorspace conversion routines.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+
+
+/* Private subobject */
+
+/* State object for this module.  jinit_color_converter() allocates one and
+ * stores it in cinfo->cconvert; the conversion routines cast that pointer
+ * back to my_cconvert_ptr to reach the private table below.
+ */
+typedef struct {
+ struct jpeg_color_converter pub; /* public fields */
+
+ /* Private state for RGB->YCC conversion */
+ /* Allocated and filled by rgb_ycc_start(); TABLE_SIZE entries. */
+ JLONG *rgb_ycc_tab; /* => table for RGB to YCbCr conversion */
+} my_color_converter;
+
+typedef my_color_converter *my_cconvert_ptr;
+
+
+/**************** RGB -> YCbCr conversion: most common case **************/
+
+/*
+ * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
+ * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
+ * The conversion equations to be implemented are therefore
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
+ * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2,
+ * rather than CENTERJSAMPLE, for Cb and Cr. This gave equal positive and
+ * negative swings for Cb/Cr, but meant that grayscale values (Cb=Cr=0)
+ * were not represented exactly. Now we sacrifice exact representation of
+ * maximum red and maximum blue in order to get exact grayscales.
+ *
+ * To avoid floating-point arithmetic, we represent the fractional constants
+ * as integers scaled up by 2^16 (about 4 digits precision); we have to divide
+ * the products by 2^16, with appropriate rounding, to get the correct answer.
+ *
+ * For even more speed, we avoid doing any multiplications in the inner loop
+ * by precalculating the constants times R,G,B for all possible values.
+ * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
+ * for 12-bit samples it is still acceptable. It's not very reasonable for
+ * 16-bit samples, but if you want lossless storage you shouldn't be changing
+ * colorspace anyway.
+ * The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
+ * in the tables to save adding them separately in the inner loop.
+ */
+
+#define SCALEBITS 16 /* speediest right-shift on some machines */
+#define CBCR_OFFSET ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
+
+/* We allocate one big table and divide it up into eight parts, instead of
+ * doing eight alloc_small requests. This lets us use a single table base
+ * address, which can be held in a register in the inner loops on many
+ * machines (more than can hold all eight addresses, anyway).
+ */
+
+#define R_Y_OFF 0 /* offset to R => Y section */
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* etc. */
+#define R_CB_OFF (3 * (MAXJSAMPLE + 1))
+#define G_CB_OFF (4 * (MAXJSAMPLE + 1))
+#define B_CB_OFF (5 * (MAXJSAMPLE + 1))
+#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */
+#define G_CR_OFF (6 * (MAXJSAMPLE + 1))
+#define B_CR_OFF (7 * (MAXJSAMPLE + 1))
+#define TABLE_SIZE (8 * (MAXJSAMPLE + 1))
+
+/* 12-bit samples use a 16-bit data type, so it is possible to pass
+ * out-of-range sample values (< 0 or > 4095) to jpeg_write_scanlines().
+ * Thus, we mask the incoming 12-bit samples to guard against overrunning
+ * or underrunning the conversion tables.
+ */
+
+#if BITS_IN_JSAMPLE == 12
+#define RANGE_LIMIT(value) ((value) & 0xFFF)
+#else
+#define RANGE_LIMIT(value) (value)
+#endif
+
+
+/* Include inline routines for colorspace extensions */
+
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
+#define rgb_gray_convert_internal extrgbx_gray_convert_internal
+#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
+#define rgb_gray_convert_internal extbgrx_gray_convert_internal
+#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extxbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extxrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
+#include "jccolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef rgb_ycc_convert_internal
+#undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
+
+
+/*
+ * Initialize for RGB->YCC colorspace conversion.
+ *
+ * Allocates one table of TABLE_SIZE JLONG entries from the JPOOL_IMAGE pool
+ * and fills each (MAXJSAMPLE + 1)-entry section with a fixed-point
+ * coefficient (scaled by 2^SCALEBITS) multiplied by every possible sample
+ * value, so the conversion loops only need table lookups, additions and one
+ * right shift.
+ */
+
+METHODDEF(void)
+rgb_ycc_start(j_compress_ptr cinfo)
+{
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+ JLONG *rgb_ycc_tab;
+ JLONG i;
+
+ /* Allocate and fill in the conversion tables. */
+ cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (TABLE_SIZE * sizeof(JLONG)));
+
+ for (i = 0; i <= MAXJSAMPLE; i++) {
+ /* Y sections: the rounding constant ONE_HALF is folded into B=>Y only,
+ * so it is added exactly once when the three lookups are summed.
+ */
+ rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+ rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+ rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+ rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
+ rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
+ /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
+ * This ensures that the maximum output will round to MAXJSAMPLE
+ * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
+ */
+ /* This section also serves as R=>Cr because R_CR_OFF == B_CB_OFF, so it
+ * carries the CBCR_OFFSET and rounding term for both Cb and Cr sums.
+ */
+ rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
+/* B=>Cb and R=>Cr tables are the same
+ rgb_ycc_tab[i + R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
+*/
+ rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
+ rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
+ }
+}
+
+
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * This is the RGB->YCbCr entry point: it selects the
+ * *_ycc_convert_internal() routine that matches the input pixel layout
+ * given by cinfo->in_color_space and forwards all arguments to it.
+ */
+
+METHODDEF(void)
+rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+{
+ /* The X and A variants of each 4-sample layout share one routine.  Any
+ * in_color_space value not listed falls through to the routine built with
+ * the default RGB_* macros.
+ */
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ }
+}
+
+
+/**************** Cases other than RGB -> YCbCr **************/
+
+
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * This is the RGB->grayscale entry point: it selects the
+ * *_gray_convert_internal() routine that matches the input pixel layout
+ * given by cinfo->in_color_space and forwards all arguments to it.
+ */
+
+METHODDEF(void)
+rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+{
+ /* The X and A variants of each 4-sample layout share one routine.  Any
+ * in_color_space value not listed falls through to the routine built with
+ * the default RGB_* macros.
+ */
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ }
+}
+
+
+/*
+ * Extended RGB to plain RGB conversion
+ * This entry point selects the *_rgb_convert_internal() routine that
+ * matches the input pixel layout given by cinfo->in_color_space and
+ * forwards all arguments to it.
+ */
+
+METHODDEF(void)
+rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+{
+ /* The X and A variants of each 4-sample layout share one routine.  Any
+ * in_color_space value not listed falls through to the routine built with
+ * the default RGB_* macros.
+ */
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ }
+}
+
+
+/*
+ * Adobe-style CMYK->YCCK conversion.
+ * Each pixel's C, M and Y samples are inverted (R = MAXJSAMPLE - C, etc.)
+ * and the resulting R, G, B values are pushed through the same table-driven
+ * RGB->YCbCr equations used for RGB input.  The K sample is copied to the
+ * fourth output plane untouched.
+ * The lookup table must already have been built by rgb_ycc_start.
+ */
+
+METHODDEF(void)
+cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+{
+  register JLONG *tab = ((my_cconvert_ptr)cinfo->cconvert)->rgb_ycc_tab;
+  JDIMENSION width = cinfo->image_width;
+  int row;
+
+  for (row = 0; row < num_rows; row++) {
+    register JSAMPROW src = input_buf[row];
+    register JSAMPROW y_row = output_buf[0][output_row + row];
+    register JSAMPROW cb_row = output_buf[1][output_row + row];
+    register JSAMPROW cr_row = output_buf[2][output_row + row];
+    register JSAMPROW k_row = output_buf[3][output_row + row];
+    register JDIMENSION x;
+
+    for (x = 0; x < width; x++, src += 4) {
+      register int red = MAXJSAMPLE - RANGE_LIMIT(src[0]);
+      register int green = MAXJSAMPLE - RANGE_LIMIT(src[1]);
+      register int blue = MAXJSAMPLE - RANGE_LIMIT(src[2]);
+
+      /* Black channel is stored verbatim. */
+      k_row[x] = src[3];
+      /* Inputs in 0..MAXJSAMPLE yield sums in 0..MAXJSAMPLE after the shift,
+       * so no range-limiting is needed; the shifted value is never negative,
+       * so a plain >> is used instead of the general RIGHT_SHIFT macro.
+       */
+      /* Y */
+      y_row[x] = (JSAMPLE)((tab[red + R_Y_OFF] + tab[green + G_Y_OFF] +
+                            tab[blue + B_Y_OFF]) >> SCALEBITS);
+      /* Cb */
+      cb_row[x] = (JSAMPLE)((tab[red + R_CB_OFF] + tab[green + G_CB_OFF] +
+                             tab[blue + B_CB_OFF]) >> SCALEBITS);
+      /* Cr */
+      cr_row[x] = (JSAMPLE)((tab[red + R_CR_OFF] + tab[green + G_CR_OFF] +
+                             tab[blue + B_CR_OFF]) >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * Produce the single grayscale output plane by copying the first sample of
+ * every input pixel, with no arithmetic.
+ * jinit_color_converter() selects this for plain grayscale input and for
+ * YCbCr input (since Y == gray); the pixel stride is input_components.
+ */
+
+METHODDEF(void)
+grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+{
+  JDIMENSION width = cinfo->image_width;
+  int pixel_step = cinfo->input_components;
+  int row;
+
+  for (row = 0; row < num_rows; row++) {
+    register JSAMPROW src = input_buf[row];
+    register JSAMPROW dst = output_buf[0][output_row + row];
+    register JDIMENSION x;
+
+    for (x = 0; x < width; x++, src += pixel_step)
+      dst[x] = src[0];
+  }
+}
+
+
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * This version handles multi-component colorspaces without conversion.
+ * We assume input_components == num_components.
+ *
+ * The only work done is de-interleaving: sample ci of every input pixel is
+ * copied to output plane ci.  The 3- and 4-component cases have dedicated
+ * loops that fill all planes in one pass over the input row; any other
+ * component count uses a generic loop.
+ */
+
+METHODDEF(void)
+null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+ register JSAMPROW inptr;
+ register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
+ register JDIMENSION col;
+ register int ci;
+ int nc = cinfo->num_components;
+ JDIMENSION num_cols = cinfo->image_width;
+
+ if (nc == 3) {
+ /* Three components: one pass per row, one sample to each plane. */
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+ for (col = 0; col < num_cols; col++) {
+ outptr0[col] = *inptr++;
+ outptr1[col] = *inptr++;
+ outptr2[col] = *inptr++;
+ }
+ }
+ } else if (nc == 4) {
+ /* Four components: same scheme with a fourth plane. */
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ outptr3 = output_buf[3][output_row];
+ output_row++;
+ for (col = 0; col < num_cols; col++) {
+ outptr0[col] = *inptr++;
+ outptr1[col] = *inptr++;
+ outptr2[col] = *inptr++;
+ outptr3[col] = *inptr++;
+ }
+ }
+ } else {
+ /* Generic component count: the input row is re-read once per component,
+ * picking sample ci out of each nc-sample pixel.
+ */
+ while (--num_rows >= 0) {
+ /* It seems fastest to make a separate pass for each component. */
+ for (ci = 0; ci < nc; ci++) {
+ inptr = *input_buf;
+ outptr = output_buf[ci][output_row];
+ for (col = 0; col < num_cols; col++) {
+ outptr[col] = inptr[ci];
+ inptr += nc;
+ }
+ }
+ input_buf++;
+ output_row++;
+ }
+ }
+}
+
+
+/*
+ * Empty method for start_pass.
+ * jinit_color_converter() installs this as the default start_pass; it is
+ * replaced by rgb_ycc_start only for conversions that need the lookup table.
+ */
+
+METHODDEF(void)
+null_method(j_compress_ptr cinfo)
+{
+ /* no work needed */
+}
+
+
+/*
+ * Module initialization routine for input colorspace conversion.
+ *
+ * Allocates the converter object in the JPOOL_IMAGE pool and stores it in
+ * cinfo->cconvert, checks that input_components agrees with in_color_space
+ * and that num_components agrees with jpeg_color_space, then selects the
+ * color_convert method for the requested in_color_space ->
+ * jpeg_color_space pair.  start_pass is set to rgb_ycc_start only when the
+ * chosen C routine needs the RGB->YCC lookup table.  Inconsistent settings
+ * and unsupported conversions are reported through ERREXIT.
+ */
+
+GLOBAL(void)
+jinit_color_converter(j_compress_ptr cinfo)
+{
+ my_cconvert_ptr cconvert;
+
+ cconvert = (my_cconvert_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_color_converter));
+ cinfo->cconvert = (struct jpeg_color_converter *)cconvert;
+ /* set start_pass to null method until we find out differently */
+ cconvert->pub.start_pass = null_method;
+
+ /* Make sure input_components agrees with in_color_space */
+ switch (cinfo->in_color_space) {
+ case JCS_GRAYSCALE:
+ if (cinfo->input_components != 1)
+ ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+ break;
+
+ case JCS_RGB:
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ case JCS_EXT_RGBA:
+ case JCS_EXT_BGRA:
+ case JCS_EXT_ABGR:
+ case JCS_EXT_ARGB:
+ /* Expected samples per pixel are looked up per colorspace. */
+ if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space])
+ ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+ break;
+
+ case JCS_YCbCr:
+ if (cinfo->input_components != 3)
+ ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+ break;
+
+ case JCS_CMYK:
+ case JCS_YCCK:
+ if (cinfo->input_components != 4)
+ ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+ break;
+
+ default: /* JCS_UNKNOWN can be anything */
+ if (cinfo->input_components < 1)
+ ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+ break;
+ }
+
+ /* Check num_components, set conversion method based on requested space */
+ switch (cinfo->jpeg_color_space) {
+ case JCS_GRAYSCALE:
+ if (cinfo->num_components != 1)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ if (cinfo->in_color_space == JCS_GRAYSCALE)
+ cconvert->pub.color_convert = grayscale_convert;
+ else if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGBX ||
+ cinfo->in_color_space == JCS_EXT_BGR ||
+ cinfo->in_color_space == JCS_EXT_BGRX ||
+ cinfo->in_color_space == JCS_EXT_XBGR ||
+ cinfo->in_color_space == JCS_EXT_XRGB ||
+ cinfo->in_color_space == JCS_EXT_RGBA ||
+ cinfo->in_color_space == JCS_EXT_BGRA ||
+ cinfo->in_color_space == JCS_EXT_ABGR ||
+ cinfo->in_color_space == JCS_EXT_ARGB) {
+ /* Prefer the SIMD routine; the C fallback needs the Y lookup tables,
+ * hence rgb_ycc_start as its start_pass.
+ */
+ if (jsimd_can_rgb_gray())
+ cconvert->pub.color_convert = jsimd_rgb_gray_convert;
+ else {
+ cconvert->pub.start_pass = rgb_ycc_start;
+ cconvert->pub.color_convert = rgb_gray_convert;
+ }
+ } else if (cinfo->in_color_space == JCS_YCbCr)
+ /* Y is the gray value, so only the first component is copied. */
+ cconvert->pub.color_convert = grayscale_convert;
+ else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_RGB:
+ if (cinfo->num_components != 3)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ /* Input already laid out as 3-sample R,G,B: plain de-interleave. */
+ if (rgb_red[cinfo->in_color_space] == 0 &&
+ rgb_green[cinfo->in_color_space] == 1 &&
+ rgb_blue[cinfo->in_color_space] == 2 &&
+ rgb_pixelsize[cinfo->in_color_space] == 3) {
+#if defined(__mips__)
+ if (jsimd_c_can_null_convert())
+ cconvert->pub.color_convert = jsimd_c_null_convert;
+ else
+#endif
+ cconvert->pub.color_convert = null_convert;
+ } else if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGBX ||
+ cinfo->in_color_space == JCS_EXT_BGR ||
+ cinfo->in_color_space == JCS_EXT_BGRX ||
+ cinfo->in_color_space == JCS_EXT_XBGR ||
+ cinfo->in_color_space == JCS_EXT_XRGB ||
+ cinfo->in_color_space == JCS_EXT_RGBA ||
+ cinfo->in_color_space == JCS_EXT_BGRA ||
+ cinfo->in_color_space == JCS_EXT_ABGR ||
+ cinfo->in_color_space == JCS_EXT_ARGB)
+ /* Any other RGB layout is reordered by rgb_rgb_convert (no table). */
+ cconvert->pub.color_convert = rgb_rgb_convert;
+ else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_YCbCr:
+ if (cinfo->num_components != 3)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGBX ||
+ cinfo->in_color_space == JCS_EXT_BGR ||
+ cinfo->in_color_space == JCS_EXT_BGRX ||
+ cinfo->in_color_space == JCS_EXT_XBGR ||
+ cinfo->in_color_space == JCS_EXT_XRGB ||
+ cinfo->in_color_space == JCS_EXT_RGBA ||
+ cinfo->in_color_space == JCS_EXT_BGRA ||
+ cinfo->in_color_space == JCS_EXT_ABGR ||
+ cinfo->in_color_space == JCS_EXT_ARGB) {
+ /* Prefer the SIMD routine; the C fallback builds its lookup table in
+ * rgb_ycc_start.
+ */
+ if (jsimd_can_rgb_ycc())
+ cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
+ else {
+ cconvert->pub.start_pass = rgb_ycc_start;
+ cconvert->pub.color_convert = rgb_ycc_convert;
+ }
+ } else if (cinfo->in_color_space == JCS_YCbCr) {
+#if defined(__mips__)
+ if (jsimd_c_can_null_convert())
+ cconvert->pub.color_convert = jsimd_c_null_convert;
+ else
+#endif
+ cconvert->pub.color_convert = null_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_CMYK:
+ if (cinfo->num_components != 4)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ /* Only CMYK input can produce CMYK output. */
+ if (cinfo->in_color_space == JCS_CMYK) {
+#if defined(__mips__)
+ if (jsimd_c_can_null_convert())
+ cconvert->pub.color_convert = jsimd_c_null_convert;
+ else
+#endif
+ cconvert->pub.color_convert = null_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_YCCK:
+ if (cinfo->num_components != 4)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ if (cinfo->in_color_space == JCS_CMYK) {
+ /* cmyk_ycck_convert reads the RGB->YCC table, so build it first. */
+ cconvert->pub.start_pass = rgb_ycc_start;
+ cconvert->pub.color_convert = cmyk_ycck_convert;
+ } else if (cinfo->in_color_space == JCS_YCCK) {
+#if defined(__mips__)
+ if (jsimd_c_can_null_convert())
+ cconvert->pub.color_convert = jsimd_c_null_convert;
+ else
+#endif
+ cconvert->pub.color_convert = null_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ default: /* allow null conversion of JCS_UNKNOWN */
+ /* Both the colorspace and the component count must match exactly. */
+ if (cinfo->jpeg_color_space != cinfo->in_color_space ||
+ cinfo->num_components != cinfo->input_components)
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#if defined(__mips__)
+ if (jsimd_c_can_null_convert())
+ cconvert->pub.color_convert = jsimd_c_null_convert;
+ else
+#endif
+ cconvert->pub.color_convert = null_convert;
+ break;
+ }
+}
diff --git a/media/libjpeg/jcdctmgr.c b/media/libjpeg/jcdctmgr.c
new file mode 100644
index 0000000000..7dae17a6e1
--- /dev/null
+++ b/media/libjpeg/jcdctmgr.c
@@ -0,0 +1,720 @@
+/*
+ * jcdctmgr.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the forward-DCT management logic.
+ * This code selects a particular DCT implementation to be used,
+ * and it performs related housekeeping chores including coefficient
+ * quantization.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+#include "jsimddct.h"
+
+
+/* Private subobject for this module */
+
+typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
+typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
+
+typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM *workspace);
+typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT *workspace);
+
+typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace);
+typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
+ FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace);
+
+METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
+
+/* State object for the forward-DCT manager.  start_pass_fdctmgr() reaches
+ * it by casting cinfo->fdct to my_fdct_ptr.
+ */
+typedef struct {
+ struct jpeg_forward_dct pub; /* public fields */
+
+ /* Pointer to the DCT routine actually in use */
+ forward_DCT_method_ptr dct;
+ convsamp_method_ptr convsamp;
+ /* May be switched from jsimd_quantize to the C quantize() by
+ * start_pass_fdctmgr() when compute_reciprocal() returns 0.
+ */
+ quantize_method_ptr quantize;
+
+ /* The actual post-DCT divisors --- not identical to the quant table
+ * entries, because of scaling (especially for an unnormalized DCT).
+ * Each table is given in normal array order.
+ */
+ /* Entries are allocated on first use in start_pass_fdctmgr(), with room
+ * for DCTSIZE2 * 4 DCTELEMs each.
+ */
+ DCTELEM *divisors[NUM_QUANT_TBLS];
+
+ /* work area for FDCT subroutine */
+ DCTELEM *workspace;
+
+#ifdef DCT_FLOAT_SUPPORTED
+ /* Same as above for the floating-point case. */
+ float_DCT_method_ptr float_dct;
+ float_convsamp_method_ptr float_convsamp;
+ float_quantize_method_ptr float_quantize;
+ FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
+ FAST_FLOAT *float_workspace;
+#endif
+} my_fdct_controller;
+
+typedef my_fdct_controller *my_fdct_ptr;
+
+
+#if BITS_IN_JSAMPLE == 8
+
+/*
+ * Find the highest bit in an integer through binary search.
+ */
+
+LOCAL(int)
+flss(UINT16 val)
+{
+  int bit = 16;
+  int step;
+
+  /* No bit set at all. */
+  if (val == 0)
+    return 0;
+
+  /* Binary search over the 16-bit value: test the top 8, 4, 2, then 1 bits
+   * (masks 0xff00, 0xf000, 0xc000, 0x8000).  Whenever the tested group is
+   * empty, the answer lies lower, so lower the result by the group width and
+   * shift the remaining bits up into the tested position.
+   */
+  for (step = 8; step > 0; step >>= 1) {
+    if ((val & ((0xffffU << (16 - step)) & 0xffffU)) == 0) {
+      bit -= step;
+      val <<= step;
+    }
+  }
+
+  return bit;
+}
+
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ * "How to optimize for the Pentium family of microprocessors"
+ * (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algorithm gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ * b = (the number of significant bits in divisor) - 1
+ * r = (word size) + b
+ * f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ * no fractional part:
+ *
+ * result = input >> r
+ *
+ * fractional part of f < 0.5:
+ *
+ * round f down to nearest integer
+ * result = ((input + 1) * f) >> r
+ *
+ * fractional part of f > 0.5:
+ *
+ * round f up to nearest integer
+ * result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ *
+ * dctbl[0] = f rounded to nearest integer
+ * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ * dctbl[2] = 1 << ((word size) * 2 - r)
+ * dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+
+LOCAL(int)
+compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
+{
+ /* Fills four values for one coefficient position, stored DCTSIZE2 entries
+ * apart starting at dtbl: reciprocal, correction, scale and shift.
+ * Returns 0 when divisor == 1 or when the final r is <= 16, and 1
+ * otherwise; start_pass_fdctmgr() replaces jsimd_quantize with the C
+ * quantize() when it sees 0.
+ */
+ UDCTELEM2 fq, fr;
+ UDCTELEM c;
+ int b, r;
+
+ if (divisor == 1) {
+ /* divisor == 1 means unquantized, so these reciprocal/correction/shift
+ * values will cause the C quantization algorithm to act like the
+ * identity function. Since only the C quantization algorithm is used in
+ * these cases, the scale value is irrelevant.
+ */
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
+ dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
+ return 0;
+ }
+
+ /* b = index of the highest set bit of divisor; r = word size + b. */
+ b = flss(divisor) - 1;
+ r = sizeof(DCTELEM) * 8 + b;
+
+ /* Quotient and remainder of 2^r / divisor. */
+ fq = ((UDCTELEM2)1 << r) / divisor;
+ fr = ((UDCTELEM2)1 << r) % divisor;
+
+ c = divisor / 2; /* for rounding */
+
+ if (fr == 0) { /* divisor is power of two */
+ /* fq will be one bit too large to fit in DCTELEM, so adjust */
+ fq >>= 1;
+ r--;
+ } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
+ c++;
+ } else { /* fractional part is > 0.5 */
+ fq++;
+ }
+
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
+ /* The scale entry is only computed when WITH_SIMD is defined; otherwise a
+ * placeholder of 1 is stored.
+ */
+#ifdef WITH_SIMD
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
+#else
+ dtbl[DCTSIZE2 * 2] = 1;
+#endif
+ dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
+
+ /* Tell the caller whether r exceeded 16 (1) or not (0). */
+ if (r <= 16) return 0;
+ else return 1;
+}
+
+#endif
+
+
+/*
+ * Initialize for a processing pass.
+ * Verify that all referenced Q-tables are present, and set up
+ * the divisor table for each one.
+ * In the current implementation, DCT of all components is done during
+ * the first pass, even if only some components will be output in the
+ * first scan. Hence all components should be examined here.
+ *
+ * Divisor tables are allocated from JPOOL_IMAGE the first time a given
+ * quantization table number is seen and are refilled on every call.
+ */
+
+METHODDEF(void)
+start_pass_fdctmgr(j_compress_ptr cinfo)
+{
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
+ int ci, qtblno, i;
+ jpeg_component_info *compptr;
+ JQUANT_TBL *qtbl;
+ DCTELEM *dtbl;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ qtblno = compptr->quant_tbl_no;
+ /* Make sure specified quantization table is present */
+ if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
+ cinfo->quant_tbl_ptrs[qtblno] == NULL)
+ ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
+ qtbl = cinfo->quant_tbl_ptrs[qtblno];
+ /* Compute divisors for this quant table */
+ /* We may do this more than once for same table, but it's not a big deal */
+ switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+ case JDCT_ISLOW:
+ /* For LL&M DCT method, divisors are equal to raw quantization
+ * coefficients multiplied by 8 (to counteract scaling).
+ */
+ if (fdct->divisors[qtblno] == NULL) {
+ fdct->divisors[qtblno] = (DCTELEM *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (DCTSIZE2 * 4) * sizeof(DCTELEM));
+ }
+ dtbl = fdct->divisors[qtblno];
+ for (i = 0; i < DCTSIZE2; i++) {
+#if BITS_IN_JSAMPLE == 8
+ /* A 0 return from compute_reciprocal() switches a selected
+ * jsimd_quantize back to the C quantize() routine.
+ */
+ if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
+ fdct->quantize == jsimd_quantize)
+ fdct->quantize = quantize;
+#else
+ dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
+#endif
+ }
+ break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+ case JDCT_IFAST:
+ {
+ /* For AA&N DCT method, divisors are equal to quantization
+ * coefficients scaled by scalefactor[row]*scalefactor[col], where
+ * scalefactor[0] = 1
+ * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
+ * We apply a further scale factor of 8.
+ */
+#define CONST_BITS 14
+ static const INT16 aanscales[DCTSIZE2] = {
+ /* precomputed values scaled up by 14 bits */
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
+ 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
+ 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
+ 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
+ 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
+ };
+ SHIFT_TEMPS
+
+ if (fdct->divisors[qtblno] == NULL) {
+ fdct->divisors[qtblno] = (DCTELEM *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (DCTSIZE2 * 4) * sizeof(DCTELEM));
+ }
+ dtbl = fdct->divisors[qtblno];
+ for (i = 0; i < DCTSIZE2; i++) {
+#if BITS_IN_JSAMPLE == 8
+ /* Descaling by CONST_BITS - 3 removes the 14-bit table scaling
+ * while leaving the extra factor of 8 in place.
+ */
+ if (!compute_reciprocal(
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3), &dtbl[i]) &&
+ fdct->quantize == jsimd_quantize)
+ fdct->quantize = quantize;
+#else
+ dtbl[i] = (DCTELEM)
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3);
+#endif
+ }
+ }
+ break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ case JDCT_FLOAT:
+ {
+ /* For float AA&N DCT method, divisors are equal to quantization
+ * coefficients scaled by scalefactor[row]*scalefactor[col], where
+ * scalefactor[0] = 1
+ * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
+ * We apply a further scale factor of 8.
+ * What's actually stored is 1/divisor so that the inner loop can
+ * use a multiplication rather than a division.
+ */
+ FAST_FLOAT *fdtbl;
+ int row, col;
+ static const double aanscalefactor[DCTSIZE] = {
+ 1.0, 1.387039845, 1.306562965, 1.175875602,
+ 1.0, 0.785694958, 0.541196100, 0.275899379
+ };
+
+ if (fdct->float_divisors[qtblno] == NULL) {
+ fdct->float_divisors[qtblno] = (FAST_FLOAT *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ DCTSIZE2 * sizeof(FAST_FLOAT));
+ }
+ fdtbl = fdct->float_divisors[qtblno];
+ /* i walks the table in row-major order alongside (row, col). */
+ i = 0;
+ for (row = 0; row < DCTSIZE; row++) {
+ for (col = 0; col < DCTSIZE; col++) {
+ fdtbl[i] = (FAST_FLOAT)
+ (1.0 / (((double)qtbl->quantval[i] *
+ aanscalefactor[row] * aanscalefactor[col] * 8.0)));
+ i++;
+ }
+ }
+ }
+ break;
+#endif
+ default:
+ /* dct_method names a DCT variant that is not compiled in. */
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ break;
+ }
+ }
+}
+
+
+/*
+ * Copy one DCTSIZE x DCTSIZE block of samples into the DCT workspace,
+ * subtracting CENTERJSAMPLE from every sample (unsigned->signed conversion).
+ *
+ * sample_data supplies the DCTSIZE sample rows, start_col is the column of
+ * the block's left edge, and workspace receives the DCTSIZE2 converted
+ * values in row-major order.
+ */
+
+METHODDEF(void)
+convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+{
+  register DCTELEM *out = workspace;
+  register JSAMPROW in;
+  register int row;
+
+  for (row = 0; row < DCTSIZE; row++, out += DCTSIZE) {
+    in = sample_data[row] + start_col;
+
+#if DCTSIZE == 8                /* one row, fully unrolled */
+    out[0] = in[0] - CENTERJSAMPLE;
+    out[1] = in[1] - CENTERJSAMPLE;
+    out[2] = in[2] - CENTERJSAMPLE;
+    out[3] = in[3] - CENTERJSAMPLE;
+    out[4] = in[4] - CENTERJSAMPLE;
+    out[5] = in[5] - CENTERJSAMPLE;
+    out[6] = in[6] - CENTERJSAMPLE;
+    out[7] = in[7] - CENTERJSAMPLE;
+#else
+    {
+      register int col;
+
+      for (col = 0; col < DCTSIZE; col++)
+        out[col] = in[col] - CENTERJSAMPLE;
+    }
+#endif
+  }
+}
+
+
+/*
+ * Quantize the DCT coefficients held in workspace[] and store the results
+ * in coef_block.
+ *
+ * With 8-bit samples, divisors[] is read as several DCTSIZE2-entry
+ * sub-tables: sub-table 0 holds a reciprocal, sub-table 1 a rounding
+ * correction, and sub-table 3 a shift count, so the division is carried out
+ * as a multiply followed by a right shift.  With other sample sizes,
+ * divisors[] holds one plain divisor per coefficient.
+ */
+
+METHODDEF(void)
+quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  int k, negative;
+  DCTELEM coef;
+  JCOEFPTR dst = coef_block;
+
+#if BITS_IN_JSAMPLE == 8
+
+  UDCTELEM recip, corr;
+  UDCTELEM2 product;
+  int shift;
+
+  for (k = 0; k < DCTSIZE2; k++) {
+    coef = workspace[k];
+    recip = divisors[k + DCTSIZE2 * 0];
+    corr = divisors[k + DCTSIZE2 * 1];
+    shift = divisors[k + DCTSIZE2 * 3];
+
+    /* Operate on the magnitude, then put the sign back. */
+    negative = (coef < 0);
+    if (negative)
+      coef = -coef;
+    product = (UDCTELEM2)(coef + corr) * recip;
+    product >>= shift + sizeof(DCTELEM) * 8;
+    coef = (DCTELEM)product;
+    if (negative)
+      coef = -coef;
+    dst[k] = (JCOEF)coef;
+  }
+
+#else
+
+  register DCTELEM qval;
+
+  for (k = 0; k < DCTSIZE2; k++) {
+    qval = divisors[k];
+    coef = workspace[k];
+    /* Rounded division of coef by qval.  The dividend is made nonnegative
+     * first so that the result does not depend on how the compiler rounds
+     * negative quotients.
+     *
+     * Many quantized values come out as zero, so unless FAST_DIVIDE is
+     * defined the division is skipped whenever the (nonnegative) dividend
+     * is smaller than the divisor.
+     */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a, b)  a /= b
+#else
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b; else a = 0
+#endif
+    negative = (coef < 0);
+    if (negative)
+      coef = -coef;
+    coef += qval >> 1;          /* add half the divisor for rounding */
+    DIVIDE_BY(coef, qval);
+    if (negative)
+      coef = -coef;
+    dst[k] = (JCOEF)coef;
+  }
+
+#endif
+
+}
+
+
+/*
+ * Run the integer forward DCT on num_blocks consecutive blocks of one
+ * component.
+ *
+ * Samples are read from sample_data[], beginning at row start_row and column
+ * start_col and advancing DCTSIZE columns per block.  Block number n of the
+ * run has its quantized coefficients written to coef_blocks[n].
+ */
+
+METHODDEF(void)
+forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
+            JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+            JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
+/* Integer-DCT flavor of the forward_DCT method. */
+{
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
+  /* Fetch everything the loop needs once, up front. */
+  forward_DCT_method_ptr dct_fn = fdct->dct;
+  convsamp_method_ptr convsamp_fn = fdct->convsamp;
+  quantize_method_ptr quantize_fn = fdct->quantize;
+  DCTELEM *qdivisors = fdct->divisors[compptr->quant_tbl_no];
+  DCTELEM *work = fdct->workspace;
+  JSAMPARRAY rows = sample_data + start_row;    /* vertical offset applied */
+  JDIMENSION col = start_col;
+  JDIMENSION blk;
+
+  for (blk = 0; blk < num_blocks; blk++) {
+    (*convsamp_fn) (rows, col, work);           /* samples -> workspace */
+    (*dct_fn) (work);                           /* in-place DCT */
+    (*quantize_fn) (coef_blocks[blk], qdivisors, work);
+    col += DCTSIZE;
+  }
+}
+
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+/*
+ * Floating-point counterpart of convsamp(): copy one DCTSIZE x DCTSIZE block
+ * of samples into workspace[], subtracting CENTERJSAMPLE from each sample and
+ * converting the difference to FAST_FLOAT.
+ */
+
+METHODDEF(void)
+convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+               FAST_FLOAT *workspace)
+{
+  register FAST_FLOAT *out = workspace;
+  register JSAMPROW in;
+  register int row;
+
+  for (row = 0; row < DCTSIZE; row++, out += DCTSIZE) {
+    in = sample_data[row] + start_col;
+#if DCTSIZE == 8                /* one row, fully unrolled */
+    out[0] = (FAST_FLOAT)(in[0] - CENTERJSAMPLE);
+    out[1] = (FAST_FLOAT)(in[1] - CENTERJSAMPLE);
+    out[2] = (FAST_FLOAT)(in[2] - CENTERJSAMPLE);
+    out[3] = (FAST_FLOAT)(in[3] - CENTERJSAMPLE);
+    out[4] = (FAST_FLOAT)(in[4] - CENTERJSAMPLE);
+    out[5] = (FAST_FLOAT)(in[5] - CENTERJSAMPLE);
+    out[6] = (FAST_FLOAT)(in[6] - CENTERJSAMPLE);
+    out[7] = (FAST_FLOAT)(in[7] - CENTERJSAMPLE);
+#else
+    {
+      register int col;
+
+      for (col = 0; col < DCTSIZE; col++)
+        out[col] = (FAST_FLOAT)(in[col] - CENTERJSAMPLE);
+    }
+#endif
+  }
+}
+
+
+/*
+ * Quantize floating-point DCT output: multiply each workspace[] value by the
+ * matching divisors[] entry, round to the nearest integer, and store the
+ * result in coef_block.
+ */
+
+METHODDEF(void)
+quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+               FAST_FLOAT *workspace)
+{
+  register JCOEFPTR dst = coef_block;
+  register FAST_FLOAT scaled;
+  register int remaining;
+
+  for (remaining = DCTSIZE2; remaining > 0; remaining--) {
+    /* Quantization and scaling are a single multiplication. */
+    scaled = *workspace++ * *divisors++;
+
+    /* Round to nearest: shift the value into the positive range by adding
+     * 16384 (plus 0.5 for rounding), truncate, and subtract 16384 again.
+     * Working on a positive value avoids any dependence on how negative
+     * values are truncated.  Coefficients are at most +-16K (12-bit data),
+     * so this works with either 16-bit or 32-bit ints.
+     */
+    *dst++ = (JCOEF)((int)(scaled + (FAST_FLOAT)16384.5) - 16384);
+  }
+}
+
+
+/*
+ * Run the floating-point forward DCT on num_blocks consecutive blocks of one
+ * component.  Samples are read from sample_data[] starting at row start_row
+ * and column start_col, advancing DCTSIZE columns per block; block number n
+ * of the run is written to coef_blocks[n].
+ */
+
+METHODDEF(void)
+forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                  JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                  JDIMENSION start_row, JDIMENSION start_col,
+                  JDIMENSION num_blocks)
+/* Floating-point flavor of the forward_DCT method. */
+{
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
+  /* Fetch everything the loop needs once, up front. */
+  float_DCT_method_ptr dct_fn = fdct->float_dct;
+  float_convsamp_method_ptr convsamp_fn = fdct->float_convsamp;
+  float_quantize_method_ptr quantize_fn = fdct->float_quantize;
+  FAST_FLOAT *qdivisors = fdct->float_divisors[compptr->quant_tbl_no];
+  FAST_FLOAT *work = fdct->float_workspace;
+  JSAMPARRAY rows = sample_data + start_row;    /* vertical offset applied */
+  JDIMENSION col = start_col;
+  JDIMENSION blk;
+
+  for (blk = 0; blk < num_blocks; blk++) {
+    (*convsamp_fn) (rows, col, work);           /* samples -> workspace */
+    (*dct_fn) (work);                           /* in-place DCT */
+    (*quantize_fn) (coef_blocks[blk], qdivisors, work);
+    col += DCTSIZE;
+  }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
+
+
+/*
+ * Initialize FDCT manager.
+ *
+ * Allocates the forward-DCT controller in the image pool, installs it in
+ * cinfo->fdct, and selects the DCT, sample-conversion, and quantization
+ * routines for cinfo->dct_method, taking the SIMD routine whenever the
+ * matching jsimd_can_*() query returns nonzero.  It also allocates the
+ * one-block workspace and marks every divisor table as not yet allocated.
+ */
+
+GLOBAL(void)
+jinit_forward_dct(j_compress_ptr cinfo)
+{
+  my_fdct_ptr fdct;
+  int tblno;
+
+  fdct = (my_fdct_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_fdct_controller));
+  cinfo->fdct = (struct jpeg_forward_dct *)fdct;
+  fdct->pub.start_pass = start_pass_fdctmgr;
+
+  /* Step 1: the transform itself and the matching forward_DCT driver. */
+  switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+    fdct->pub.forward_DCT = forward_DCT;
+    fdct->dct = jsimd_can_fdct_islow() ? jsimd_fdct_islow : jpeg_fdct_islow;
+    break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+    fdct->pub.forward_DCT = forward_DCT;
+    fdct->dct = jsimd_can_fdct_ifast() ? jsimd_fdct_ifast : jpeg_fdct_ifast;
+    break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+    fdct->pub.forward_DCT = forward_DCT_float;
+    fdct->float_dct =
+      jsimd_can_fdct_float() ? jsimd_fdct_float : jpeg_fdct_float;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+
+  /* Step 2: sample conversion and quantization. */
+  switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#endif
+#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+    fdct->convsamp = jsimd_can_convsamp() ? jsimd_convsamp : convsamp;
+    fdct->quantize = jsimd_can_quantize() ? jsimd_quantize : quantize;
+    break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+    fdct->float_convsamp =
+      jsimd_can_convsamp_float() ? jsimd_convsamp_float : convsamp_float;
+    fdct->float_quantize =
+      jsimd_can_quantize_float() ? jsimd_quantize_float : quantize_float;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+
+  /* Step 3: one block's worth of workspace, of the type the method needs. */
+#ifdef DCT_FLOAT_SUPPORTED
+  if (cinfo->dct_method == JDCT_FLOAT)
+    fdct->float_workspace = (FAST_FLOAT *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  sizeof(FAST_FLOAT) * DCTSIZE2);
+  else
+#endif
+    fdct->workspace = (DCTELEM *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  sizeof(DCTELEM) * DCTSIZE2);
+
+  /* Step 4: no divisor table exists yet. */
+  for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
+    fdct->divisors[tblno] = NULL;
+#ifdef DCT_FLOAT_SUPPORTED
+    fdct->float_divisors[tblno] = NULL;
+#endif
+  }
+}
diff --git a/media/libjpeg/jchuff.c b/media/libjpeg/jchuff.c
new file mode 100644
index 0000000000..5d0276ad25
--- /dev/null
+++ b/media/libjpeg/jchuff.c
@@ -0,0 +1,1136 @@
+/*
+ * jchuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
+ * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy encoding routines.
+ *
+ * Much of the complexity here has to do with supporting output suspension.
+ * If the data destination module demands suspension, we want to be able to
+ * back up to the start of the current MCU. To do this, we copy state
+ * variables into local working storage, and update them back to the
+ * permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include <limits.h>
+
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif /* !__thumb__ || __thumb2__ */
+#endif /* Arm target */
+
+/* JPEG_NBITS(x) and JPEG_NBITS_NONZERO(x) give the bit count used for a
+ * coefficient magnitude x.  With USE_CLZ_INTRINSIC, both are computed as 32
+ * minus a leading-zero count, and JPEG_NBITS() special-cases x == 0 to 0
+ * before invoking the intrinsic.  Otherwise, both are a lookup in
+ * jpeg_nbits_table[].
+ */
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else /* !USE_CLZ_INTRINSIC: table lookup */
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif /* USE_CLZ_INTRINSIC */
+
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+/* bit_buf_type is the bit accumulator used by the C Huffman encoder:
+ * unsigned long long when building for x86-64 with the ILP32 data model,
+ * size_t otherwise.
+ */
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
+
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
+ */
+/* simd_bit_buf_type is the accumulator type used on the SIMD path: unsigned
+ * long long for non-Arm SIMD builds, bit_buf_type otherwise.
+ */
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;
+#else
+typedef bit_buf_type simd_bit_buf_type;
+#endif
+
+/* BIT_BUF_SIZE is the width in bits assumed for bit_buf_type; it must be
+ * known to the preprocessor because FLUSH() is selected with #if.
+ */
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+ (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE 64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE 32
+#else
+#error Cannot determine word size
+#endif
+#define SIMD_BIT_BUF_SIZE (sizeof(simd_bit_buf_type) * 8) /* width in bits of simd_bit_buf_type */
+
+/* Encoder state that changes while an MCU is being written: the bit
+ * accumulator, its fill level, and the DC predictors.  encode_mcu_huff()
+ * works on a copy of this record (working_state.cur).
+ * put_buffer.c is the view used by the C encoder, put_buffer.simd the view
+ * used when the SIMD encoder is selected.
+ */
+typedef struct {
+ union {
+ bit_buf_type c;
+ simd_bit_buf_type simd;
+ } put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ /* (Neon GAS: # of bits now in it) */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+/* Private state of the Huffman entropy encoder; cinfo->entropy is cast to a
+ * pointer to this struct (huff_entropy_ptr).
+ */
+typedef struct {
+ struct jpeg_entropy_encoder pub; /* public fields */
+
+ savable_state saved; /* Bit buffer & DC state at start of MCU */
+
+ /* These fields are NOT loaded into local working state. */
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+ int next_restart_num; /* next restart number to write (0-7) */
+
+ /* Pointers to derived tables (these workspaces have image lifespan) */
+ c_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+ c_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
+
+#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */
+ long *dc_count_ptrs[NUM_HUFF_TBLS]; /* 257 longs each, allocated by start_pass_huff() */
+ long *ac_count_ptrs[NUM_HUFF_TBLS]; /* 257 longs each, allocated by start_pass_huff() */
+#endif
+
+ int simd; /* result of jsimd_can_huff_encode_one_block(); nonzero selects the SIMD block encoder */
+} huff_entropy_encoder;
+
+typedef huff_entropy_encoder *huff_entropy_ptr;
+
+/* Working state while writing an MCU.
+ * This struct contains all the fields that are needed by subroutines.
+ * encode_mcu_huff() fills it from the destination manager and from the
+ * encoder's saved state before encoding the blocks of an MCU.
+ */
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd; /* copy of huff_entropy_encoder.simd */
+} working_state;
+
+
+/* Forward declarations */
+METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
+#ifdef ENTROPY_OPT_SUPPORTED
+METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
+#endif
+
+
+/*
+ * Prepare the Huffman encoder for one scan.
+ *
+ * When gather_statistics is TRUE, the scan produces no output: symbols are
+ * only counted so that Huffman tables can be generated afterwards.  When it
+ * is FALSE, the derived encoding tables for every component in the scan are
+ * (re)computed.  In both cases the DC predictors, the bit buffer, and the
+ * restart bookkeeping are reset.
+ */
+
+METHODDEF(void)
+start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  jpeg_component_info *comp;
+  int scan_idx, dc_idx, ac_idx;
+
+  /* Install the per-MCU and end-of-pass methods for this kind of pass. */
+  if (!gather_statistics) {
+    entropy->pub.encode_mcu = encode_mcu_huff;
+    entropy->pub.finish_pass = finish_pass_huff;
+  } else {
+#ifdef ENTROPY_OPT_SUPPORTED
+    entropy->pub.encode_mcu = encode_mcu_gather;
+    entropy->pub.finish_pass = finish_pass_gather;
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  }
+
+  entropy->simd = jsimd_can_huff_encode_one_block();
+
+  for (scan_idx = 0; scan_idx < cinfo->comps_in_scan; scan_idx++) {
+    comp = cinfo->cur_comp_info[scan_idx];
+    dc_idx = comp->dc_tbl_no;
+    ac_idx = comp->ac_tbl_no;
+
+    if (!gather_statistics) {
+      /* Build the derived tables.  A table shared by several components is
+       * rebuilt for each of them, which is cheap enough not to matter.
+       */
+      jpeg_make_c_derived_tbl(cinfo, TRUE, dc_idx,
+                              &entropy->dc_derived_tbls[dc_idx]);
+      jpeg_make_c_derived_tbl(cinfo, FALSE, ac_idx,
+                              &entropy->ac_derived_tbls[ac_idx]);
+    } else {
+#ifdef ENTROPY_OPT_SUPPORTED
+      /* Table numbers must be validated here, because
+       * jpeg_make_c_derived_tbl() is not called on this path.
+       */
+      if (dc_idx < 0 || dc_idx >= NUM_HUFF_TBLS)
+        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dc_idx);
+      if (ac_idx < 0 || ac_idx >= NUM_HUFF_TBLS)
+        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, ac_idx);
+
+      /* Make sure both count tables exist, then clear them.  Each table
+       * has 257 entries, as jpeg_gen_optimal_table requires.
+       */
+      if (entropy->dc_count_ptrs[dc_idx] == NULL)
+        entropy->dc_count_ptrs[dc_idx] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      memset(entropy->dc_count_ptrs[dc_idx], 0, 257 * sizeof(long));
+      if (entropy->ac_count_ptrs[ac_idx] == NULL)
+        entropy->ac_count_ptrs[ac_idx] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      memset(entropy->ac_count_ptrs[ac_idx], 0, 257 * sizeof(long));
+#endif
+    }
+
+    /* DC prediction starts from 0 in every scan. */
+    entropy->saved.last_dc_val[scan_idx] = 0;
+  }
+
+  /* Start with an empty bit buffer, in the representation that the selected
+   * block encoder uses.
+   */
+  if (!entropy->simd) {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  } else {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  }
+
+  /* Restart bookkeeping */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Build the encoder-side derived table (code and code length, indexed by
+ * symbol) for Huffman table number tblno, validating the table on the way.
+ *
+ * isDC selects the DC or the AC table set.  *pdtbl receives the derived
+ * table; it is allocated from the image pool if it is still NULL.
+ *
+ * Note this is also used by jcphuff.c.
+ */
+
+GLOBAL(void)
+jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
+                        c_derived_tbl **pdtbl)
+{
+  JHUFF_TBL *src;
+  c_derived_tbl *derived;
+  char codelen[257];            /* code length of the n-th symbol */
+  unsigned int codeval[257];    /* code of the n-th symbol */
+  unsigned int next_code;
+  int n, count, len, numsyms, cur_len, sym, sym_limit;
+
+  /* codelen[] and codeval[] are filled in order of increasing code length,
+   * i.e. in the same order as the symbols appear in src->huffval[].
+   */
+
+  /* Locate the source table */
+  if (tblno < 0 || tblno >= NUM_HUFF_TBLS)
+    ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);
+  if (isDC)
+    src = cinfo->dc_huff_tbl_ptrs[tblno];
+  else
+    src = cinfo->ac_huff_tbl_ptrs[tblno];
+  if (src == NULL)
+    ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);
+
+  /* Obtain the output table, allocating it on first use */
+  if (*pdtbl == NULL)
+    *pdtbl = (c_derived_tbl *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  sizeof(c_derived_tbl));
+  derived = *pdtbl;
+
+  /* Figure C.1: list the code length of every symbol */
+
+  numsyms = 0;
+  for (len = 1; len <= 16; len++) {
+    count = (int)src->bits[len];
+    if (count < 0 || numsyms + count > 256)     /* would overrun codelen[] */
+      ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+    while (count--)
+      codelen[numsyms++] = (char)len;
+  }
+  codelen[numsyms] = 0;         /* terminator */
+
+  /* Figure C.2: assign the codes, checking that the length counts describe
+   * a legal Huffman code tree.
+   */
+
+  next_code = 0;
+  cur_len = codelen[0];
+  for (n = 0; codelen[n] != 0; ) {
+    for (; ((int)codelen[n]) == cur_len; n++)
+      codeval[n] = next_code++;
+    /* next_code is one past the last code of length cur_len.  It must still
+     * fit in cur_len bits, because an all-ones code is not allowed.
+     */
+    if (((JLONG)next_code) >= (((JLONG)1) << cur_len))
+      ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+    next_code <<= 1;
+    cur_len++;
+  }
+
+  /* Figure C.3: build the tables indexed by symbol value */
+
+  /* Symbols without a code keep length 0.  That makes duplicate VAL entries
+   * detectable below, and lets emit_bits detect any later attempt to emit
+   * such a symbol.
+   */
+  memset(derived->ehufco, 0, sizeof(derived->ehufco));
+  memset(derived->ehufsi, 0, sizeof(derived->ehufsi));
+
+  /* Range-check the VAL entries as well: 0..255 is accepted for AC symbols
+   * but only 0..15 for DC.  (Tighter limits based on data depth and mode
+   * would be possible, but this seems enough.)
+   */
+  sym_limit = isDC ? 15 : 255;
+
+  for (n = 0; n < numsyms; n++) {
+    sym = src->huffval[n];
+    if (sym < 0 || sym > sym_limit || derived->ehufsi[sym])
+      ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+    derived->ehufco[sym] = codeval[n];
+    derived->ehufsi[sym] = codelen[n];
+  }
+}
+
+
+/* Outputting bytes to the file */
+
+/* Emit a byte, taking 'action' if must suspend.
+ * The byte is stored first.  If that store used up the last free slot in the
+ * output buffer, dump_buffer() is called, and 'action' is executed only when
+ * dump_buffer() returns FALSE.  'state' is evaluated more than once.
+ */
+#define emit_byte(state, val, action) { \
+ *(state)->next_output_byte++ = (JOCTET)(val); \
+ if (--(state)->free_in_buffer == 0) \
+ if (!dump_buffer(state)) \
+ { action; } \
+}
+
+
+LOCAL(boolean)
+dump_buffer(working_state *state)
+/* Ask the destination manager to empty the output buffer.
+ * Returns FALSE if the caller must suspend; otherwise reloads the working
+ * buffer pointer and free count from the destination manager and returns
+ * TRUE.
+ */
+{
+  j_compress_ptr cinfo = state->cinfo;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
+
+  if ((*dest->empty_output_buffer) (cinfo)) {
+    /* The destination manager has published fresh buffer pointers. */
+    state->next_output_byte = dest->next_output_byte;
+    state->free_in_buffer = dest->free_in_buffer;
+    return TRUE;
+  }
+  return FALSE;
+}
+
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF. Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
+ *
+ * The advance is written as "buffer -= -2 + (b < 0xFF)": the comparison
+ * yields 1 for an ordinary byte (buffer -= -1) and 0 for 0xFF (buffer -= -2).
+ * The macro uses the caller's local 'buffer' and evaluates b twice.
+ */
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ *
+ * The 0xFF test works on all bytes of the word at once: adding 1 to every
+ * byte clears the top bit of a byte that was 0xFF, so
+ * "x & 0x80.. & ~(x + 0x01..)" is nonzero when some byte had its top bit set
+ * before the addition and clear after it.
+ *
+ * Bytes are written most significant first.  The macro uses the caller's
+ * local 'put_buffer' and 'buffer' variables.
+ */
+#if BIT_BUF_SIZE == 64
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 56); \
+ buffer[1] = (JOCTET)(put_buffer >> 48); \
+ buffer[2] = (JOCTET)(put_buffer >> 40); \
+ buffer[3] = (JOCTET)(put_buffer >> 32); \
+ buffer[4] = (JOCTET)(put_buffer >> 24); \
+ buffer[5] = (JOCTET)(put_buffer >> 16); \
+ buffer[6] = (JOCTET)(put_buffer >> 8); \
+ buffer[7] = (JOCTET)(put_buffer); \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer); \
+ buffer += 4; \
+ } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ *
+ * PUT_BITS() invokes this only after subtracting size from free_bits and
+ * finding the result negative, so on entry -free_bits is the number of bits
+ * of code that do not fit and (size + free_bits) is the number that do.
+ * After the flush, free_bits is raised by BIT_BUF_SIZE and the whole of code
+ * is stored in put_buffer.  Uses the caller's local 'put_buffer',
+ * 'free_bits', and 'buffer' variables.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ *
+ * size is subtracted from free_bits first.  A negative result means that code
+ * does not fit entirely and PUT_AND_FLUSH() takes over; otherwise code is
+ * appended below the bits already in put_buffer.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+/* Emit a Huffman code followed by the value bits held in the caller's 'temp'
+ * with a single PUT_BITS(): temp is masked to its low nbits bits, code is
+ * placed above them, and nbits is increased by size.  The caller's 'temp' and
+ * 'nbits' are both modified.
+ */
+#define PUT_CODE(code, size) { \
+ temp &= (((JLONG)1) << nbits) - 1; \
+ temp |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(temp, nbits) \
+}
+
+
+/* Although it is exceedingly rare, it is possible for a Huffman-encoded
+ * coefficient block to be larger than the 128-byte unencoded block. For each
+ * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can
+ * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per
+ * encoded block.) If, for instance, one artificially sets the AC
+ * coefficients to alternating values of 32767 and -32768 (using the JPEG
+ * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
+ * larger than 200 bytes.
+ */
+#define BUFSIZE (DCTSIZE2 * 8)
+
+/* Choose where the encoded bytes are written: directly into the destination
+ * buffer when at least BUFSIZE bytes are free there, otherwise into the
+ * caller's local _buffer[] (localbuf is then set to 1 so that STORE_BUFFER()
+ * copies the data out afterwards).  Uses the caller's 'state', 'buffer',
+ * '_buffer', and 'localbuf'.
+ */
+#define LOAD_BUFFER() { \
+ if (state->free_in_buffer < BUFSIZE) { \
+ localbuf = 1; \
+ buffer = _buffer; \
+ } else \
+ buffer = state->next_output_byte; \
+}
+
+/* Commit the bytes written since LOAD_BUFFER().  If the local buffer was
+ * used, its contents are copied to the destination buffer in pieces no larger
+ * than the free space, calling dump_buffer() each time the destination
+ * buffer fills up.  NOTE: if dump_buffer() fails, this macro executes
+ * "return FALSE" in the enclosing function.  If the destination buffer was
+ * written directly, only the working pointer and free count are updated.
+ */
+#define STORE_BUFFER() { \
+ if (localbuf) { \
+ size_t bytes, bytestocopy; \
+ bytes = buffer - _buffer; \
+ buffer = _buffer; \
+ while (bytes > 0) { \
+ bytestocopy = MIN(bytes, state->free_in_buffer); \
+ memcpy(state->next_output_byte, buffer, bytestocopy); \
+ state->next_output_byte += bytestocopy; \
+ buffer += bytestocopy; \
+ state->free_in_buffer -= bytestocopy; \
+ if (state->free_in_buffer == 0) \
+ if (!dump_buffer(state)) return FALSE; \
+ bytes -= bytestocopy; \
+ } \
+ } else { \
+ state->free_in_buffer -= (buffer - state->next_output_byte); \
+ state->next_output_byte = buffer; \
+ } \
+}
+
+
+LOCAL(boolean)
+flush_bits(working_state *state)
+/* Write out the bits still held in the bit buffer, padding the last partial
+ * byte with 1-bits, and reset the bit buffer to empty.
+ * Returns TRUE on success; STORE_BUFFER() returns FALSE from this function
+ * if dump_buffer() fails.
+ */
+{
+ JOCTET _buffer[BUFSIZE], *buffer, temp;
+ simd_bit_buf_type put_buffer; int put_bits;
+ int localbuf = 0;
+
+ /* Determine how many bits are pending.  free_bits normally counts the unused
+ * bits; in the AArch64 non-intrinsics SIMD build it is used directly as the
+ * number of bits held.
+ */
+ if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ put_bits = state->cur.free_bits;
+#else
+ put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+ put_buffer = state->cur.put_buffer.simd;
+ } else {
+ put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
+ }
+
+ LOAD_BUFFER()
+
+ /* Emit the complete bytes, taken from bit position put_bits downwards */
+ while (put_bits >= 8) {
+ put_bits -= 8;
+ temp = (JOCTET)(put_buffer >> put_bits);
+ EMIT_BYTE(temp)
+ }
+ if (put_bits) {
+ /* fill partial byte with ones */
+ temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+ EMIT_BYTE(temp)
+ }
+
+ if (state->simd) { /* and reset bit buffer to empty */
+ state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ state->cur.free_bits = 0;
+#else
+ state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+ } else {
+ state->cur.put_buffer.c = 0;
+ state->cur.free_bits = BIT_BUF_SIZE;
+ }
+ STORE_BUFFER()
+
+ return TRUE;
+}
+
+
+/* Encode a single block's worth of coefficients */
+
+LOCAL(boolean)
+encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
+/* Encode one block through jsimd_huff_encode_one_block().
+ * Returns TRUE on success; STORE_BUFFER() returns FALSE from this function
+ * if dump_buffer() fails.
+ */
+{
+ JOCTET _buffer[BUFSIZE], *buffer;
+ int localbuf = 0;
+
+ LOAD_BUFFER()
+
+ /* The pointer returned by the SIMD routine is taken as the new end of the
+ * output; STORE_BUFFER() measures the amount written from it.
+ * NOTE(review): the bit-buffer state is presumably updated through 'state'
+ * by the SIMD routine itself -- confirm against its implementation.
+ */
+ buffer = jsimd_huff_encode_one_block(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+
+ STORE_BUFFER()
+
+ return TRUE;
+}
+
+LOCAL(boolean)
+encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
+/* Encode one block with the C implementation: the DC difference against
+ * last_dc_val using dctbl, then the 63 AC coefficients using actbl.
+ * The bit buffer is copied from state->cur into locals on entry and written
+ * back before the output bytes are committed.
+ * Returns TRUE on success; STORE_BUFFER() returns FALSE from this function
+ * if dump_buffer() fails.
+ */
+{
+ int temp, nbits, free_bits;
+ bit_buf_type put_buffer;
+ JOCTET _buffer[BUFSIZE], *buffer;
+ int localbuf = 0;
+
+ free_bits = state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
+ LOAD_BUFFER()
+
+ /* Encode the DC coefficient difference per section F.1.2.1 */
+
+ temp = block[0] - last_dc_val;
+
+ /* This is a well-known technique for obtaining the absolute value without a
+ * branch. It is derived from an assembly language technique presented in
+ * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
+ * Agner Fog. This code assumes we are on a two's complement machine.
+ */
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp += nbits;
+ nbits ^= temp;
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = JPEG_NBITS(nbits);
+
+ /* Emit the Huffman-coded symbol for the number of bits.
+ * Emit that number of bits of the value, if positive,
+ * or the complement of its magnitude, if negative.
+ */
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
+
+ /* Encode the AC coefficients per section F.1.2.2 */
+
+ {
+ int r = 0; /* r = run length of zeros, kept multiplied by 16 */
+
+/* Manually unroll the k loop to eliminate the counter variable. This
+ * improves performance greatly on systems with a limited number of
+ * registers (such as x86.)
+ */
+#define kloop(jpeg_natural_order_of_k) { \
+ if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
+ r += 16; \
+ } else { \
+ /* Branch-less absolute value, bitwise complement, etc., same as above */ \
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp += nbits; \
+ nbits ^= temp; \
+ nbits = JPEG_NBITS_NONZERO(nbits); \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r >= 16 * 16) { \
+ r -= 16 * 16; \
+ PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
+ } \
+ /* Emit Huffman symbol for run length / number of bits */ \
+ r += nbits; \
+ PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
+ r = 0; \
+ } \
+}
+
+ /* One iteration for each value in jpeg_natural_order[] */
+ kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3);
+ kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18);
+ kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26);
+ kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27);
+ kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21);
+ kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57);
+ kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15);
+ kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58);
+ kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39);
+ kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47);
+ kloop(55); kloop(62); kloop(63);
+
+ /* If the last coef(s) were zero, emit an end-of-block code */
+ if (r > 0) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+ }
+
+ /* Write the local bit buffer back before committing the output bytes */
+ state->cur.put_buffer.c = put_buffer;
+ state->cur.free_bits = free_bits;
+ STORE_BUFFER()
+
+ return TRUE;
+}
+
+
+/*
+ * Write restart marker number restart_num and reset the DC predictions.
+ *
+ * Pending bits are flushed first, then the two marker bytes are emitted.
+ * Returns FALSE as soon as any of these steps has to suspend.
+ */
+
+LOCAL(boolean)
+emit_restart(working_state *state, int restart_num)
+{
+  int comp;
+
+  if (!flush_bits(state))
+    return FALSE;
+
+  emit_byte(state, 0xFF, return FALSE);
+  emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
+
+  /* Every component of the scan predicts from 0 again after a restart. */
+  comp = state->cinfo->comps_in_scan;
+  while (--comp >= 0)
+    state->cur.last_dc_val[comp] = 0;
+
+  /* The restart counter is left alone here; it is only updated once the MCU
+   * has been written successfully.
+   */
+
+  return TRUE;
+}
+
+
+/*
+ * Huffman-encode one MCU and append it to the destination buffer.
+ *
+ * All work happens on a local working_state copy.  The destination manager
+ * fields and the saved entropy state are written back only after every block
+ * of the MCU has been encoded, so a FALSE return (restart marker or block
+ * could not be written) leaves that permanent state untouched.
+ */
+
+METHODDEF(boolean)
+encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  working_state state;
+  jpeg_component_info *compptr;
+  JCOEFPTR block;
+  boolean ok;
+  int use_simd = entropy->simd;
+  int blkn, ci;
+
+  /* Snapshot the output position and the saved entropy state */
+  state.cinfo = cinfo;
+  state.simd = entropy->simd;
+  state.cur = entropy->saved;
+  state.next_output_byte = cinfo->dest->next_output_byte;
+  state.free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* A restart marker precedes this MCU once the interval has run out */
+  if (cinfo->restart_interval && entropy->restarts_to_go == 0 &&
+      !emit_restart(&state, entropy->next_restart_num))
+    return FALSE;
+
+  /* Encode every block of the MCU with the SIMD or the plain C encoder */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+    block = MCU_data[blkn][0];
+
+    if (use_simd)
+      ok = encode_one_block_simd(&state, block, state.cur.last_dc_val[ci],
+                                 entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                                 entropy->ac_derived_tbls[compptr->ac_tbl_no]);
+    else
+      ok = encode_one_block(&state, block, state.cur.last_dc_val[ci],
+                            entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                            entropy->ac_derived_tbls[compptr->ac_tbl_no]);
+    if (!ok)
+      return FALSE;
+
+    /* This block's DC value is the prediction for the component's next block */
+    state.cur.last_dc_val[ci] = block[0];
+  }
+
+  /* The whole MCU went out: commit the working state */
+  entropy->saved = state.cur;
+  cinfo->dest->free_in_buffer = state.free_in_buffer;
+  cinfo->dest->next_output_byte = state.next_output_byte;
+
+  /* Commit the restart bookkeeping as well */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      /* Restart marker numbers cycle through 0..7 */
+      entropy->next_restart_num = (entropy->next_restart_num + 1) & 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * End-of-scan processing for Huffman encoding: flush whatever bits are still
+ * pending and commit the resulting output position and entropy state.
+ * A flush_bits() failure is reported through ERREXIT with JERR_CANT_SUSPEND.
+ */
+
+METHODDEF(void)
+finish_pass_huff(j_compress_ptr cinfo)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  working_state state;
+
+  /* flush_bits() operates on a working_state, so build one first */
+  state.cinfo = cinfo;
+  state.simd = entropy->simd;
+  state.cur = entropy->saved;
+  state.free_in_buffer = cinfo->dest->free_in_buffer;
+  state.next_output_byte = cinfo->dest->next_output_byte;
+
+  /* Push out the final bits of the scan */
+  if (!flush_bits(&state))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Write the working state back */
+  entropy->saved = state.cur;
+  cinfo->dest->free_in_buffer = state.free_in_buffer;
+  cinfo->dest->next_output_byte = state.next_output_byte;
+}
+
+
+/*
+ * Huffman coding optimization.
+ *
+ * We first scan the supplied data and count the number of uses of each symbol
+ * that is to be Huffman-coded. (This process MUST agree with the code above.)
+ * Then we build a Huffman coding tree for the observed counts.
+ * Symbols which are not needed at all for the particular image are not
+ * assigned any code, which saves space in the DHT marker as well as in
+ * the compressed data.
+ */
+
+#ifdef ENTROPY_OPT_SUPPORTED
+
+
+/* Process a single block's worth of coefficients.
+ *
+ * Nothing is output: the routine only increments, in dc_counts[] and
+ * ac_counts[], the entry of each Huffman symbol that encoding this block
+ * would use.  last_dc_val is the DC prediction subtracted from block[0].
+ * Out-of-range coefficient magnitudes are reported via
+ * ERREXIT(JERR_BAD_DCT_COEF).
+ */
+
+LOCAL(void)
+htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
+ long dc_counts[], long ac_counts[])
+{
+ register int temp;
+ register int nbits;
+ register int k, r;
+
+ /* Count the DC coefficient difference per section F.1.2.1 */
+
+ temp = block[0] - last_dc_val;
+ if (temp < 0)
+ temp = -temp;
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = 0;
+ while (temp) {
+ nbits++;
+ temp >>= 1;
+ }
+ /* Check for out-of-range coefficient values.
+ * Since we're encoding a difference, the range limit is twice as much.
+ */
+ if (nbits > MAX_COEF_BITS + 1)
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+ /* Count the Huffman symbol for the number of bits */
+ dc_counts[nbits]++;
+
+ /* Count the AC coefficients per section F.1.2.2 */
+
+ r = 0; /* r = run length of zeros */
+
+ for (k = 1; k < DCTSIZE2; k++) {
+ if ((temp = block[jpeg_natural_order[k]]) == 0) {
+ r++;
+ } else {
+ /* if run length > 15, must count special run-length-16 codes (0xF0) */
+ while (r > 15) {
+ ac_counts[0xF0]++;
+ r -= 16;
+ }
+
+ /* Take the absolute value of the coefficient */
+ if (temp < 0)
+ temp = -temp;
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = 1; /* there must be at least one 1 bit */
+ while ((temp >>= 1))
+ nbits++;
+ /* Check for out-of-range coefficient values */
+ if (nbits > MAX_COEF_BITS)
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+ /* Count Huffman symbol for run length / number of bits */
+ ac_counts[(r << 4) + nbits]++;
+
+ r = 0;
+ }
+ }
+
+ /* If the last coef(s) were zero, count an end-of-block code (symbol 0) */
+ if (r > 0)
+ ac_counts[0]++;
+}
+
+
+/*
+ * Statistics-gathering counterpart of the MCU encoder: every block of the
+ * MCU is passed to htest_one_block() so that its symbols are counted.
+ * Nothing is written to the output, so the routine always returns TRUE.
+ */
+
+METHODDEF(boolean)
+encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  jpeg_component_info *compptr;
+  JCOEFPTR block;
+  int blkn, ci;
+
+  /* Restart-interval handling: only the DC predictions and the countdown
+   * are maintained here.
+   */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      /* DC predictions start over from 0 at a restart boundary */
+      ci = 0;
+      while (ci < cinfo->comps_in_scan) {
+        entropy->saved.last_dc_val[ci] = 0;
+        ci++;
+      }
+      entropy->restarts_to_go = cinfo->restart_interval;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Tally the symbols of each block, then remember its DC value */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+    block = MCU_data[blkn][0];
+
+    htest_one_block(cinfo, block, entropy->saved.last_dc_val[ci],
+                    entropy->dc_count_ptrs[compptr->dc_tbl_no],
+                    entropy->ac_count_ptrs[compptr->ac_tbl_no]);
+    entropy->saved.last_dc_val[ci] = block[0];
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Generate the best Huffman code table for the given counts, fill htbl.
+ * Note this is also used by jcphuff.c.
+ *
+ * The JPEG standard requires that no symbol be assigned a codeword of all
+ * one bits (so that padding bits added at the end of a compressed segment
+ * can't look like a valid code). Because of the canonical ordering of
+ * codewords, this just means that there must be an unused slot in the
+ * longest codeword length category. Annex K (Clause K.2) of
+ * Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 suggests reserving such a slot
+ * by pretending that symbol 256 is a valid symbol with count 1. In theory
+ * that's not optimal; giving it count zero but including it in the symbol set
+ * anyway should give a better Huffman code. But the theoretically better code
+ * actually seems to come out worse in practice, because it produces more
+ * all-ones bytes (which incur stuffed zero bytes in the final file). In any
+ * case the difference is tiny.
+ *
+ * The JPEG standard requires Huffman codes to be no more than 16 bits long.
+ * If some symbols have a very small but nonzero probability, the Huffman tree
+ * must be adjusted to meet the code length restriction. We currently use
+ * the adjustment method suggested in JPEG section K.2. This method is *not*
+ * optimal; it may not choose the best possible limited-length code. But
+ * typically only very-low-frequency symbols will be given less-than-optimal
+ * lengths, so the code is almost optimal. Experimental comparisons against
+ * an optimal limited-length-code algorithm indicate that the difference is
+ * microscopic --- usually less than a hundredth of a percent of total size.
+ * So the extra complexity of an optimal algorithm doesn't seem worthwhile.
+ */
+
+/* Parameters:
+ * cinfo - compression object; used here only for error reporting.
+ * htbl - table to fill: bits[], huffval[] and sent_table are written.
+ * freq - symbol counts; must have room for index 256, which is overwritten
+ * with 1. The array is modified (merged/zeroed) by this routine.
+ */
+GLOBAL(void)
+jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
+{
+#define MAX_CLEN 32 /* assumed maximum initial code length */
+ UINT8 bits[MAX_CLEN + 1]; /* bits[k] = # of symbols with code length k */
+ int codesize[257]; /* codesize[k] = code length of symbol k */
+ int others[257]; /* next symbol in current branch of tree */
+ int c1, c2;
+ int p, i, j;
+ long v;
+
+ /* This algorithm is explained in section K.2 of the JPEG standard */
+
+ memset(bits, 0, sizeof(bits));
+ memset(codesize, 0, sizeof(codesize));
+ for (i = 0; i < 257; i++)
+ others[i] = -1; /* init links to empty */
+
+ freq[256] = 1; /* make sure 256 has a nonzero count */
+ /* Including the pseudo-symbol 256 in the Huffman procedure guarantees
+ * that no real symbol is given code-value of all ones, because 256
+ * will be placed last in the largest codeword category.
+ */
+
+ /* Huffman's basic algorithm to assign optimal code lengths to symbols */
+
+ for (;;) {
+ /* Find the smallest nonzero frequency, set c1 = its symbol */
+ /* In case of ties, take the larger symbol number */
+ /* NOTE(review): v starts at 1000000000, so an entry whose count exceeds
+ * that bound is never selected by either search below -- confirm that
+ * counts (including merged ones) cannot get that large.
+ */
+ c1 = -1;
+ v = 1000000000L;
+ for (i = 0; i <= 256; i++) {
+ if (freq[i] && freq[i] <= v) {
+ v = freq[i];
+ c1 = i;
+ }
+ }
+
+ /* Find the next smallest nonzero frequency, set c2 = its symbol */
+ /* In case of ties, take the larger symbol number */
+ c2 = -1;
+ v = 1000000000L;
+ for (i = 0; i <= 256; i++) {
+ if (freq[i] && freq[i] <= v && i != c1) {
+ v = freq[i];
+ c2 = i;
+ }
+ }
+
+ /* Done if we've merged everything into one frequency */
+ if (c2 < 0)
+ break;
+
+ /* Else merge the two counts/trees */
+ freq[c1] += freq[c2];
+ freq[c2] = 0;
+
+ /* Increment the codesize of everything in c1's tree branch */
+ /* (c1 is advanced along the others[] chain and ends on its last member) */
+ codesize[c1]++;
+ while (others[c1] >= 0) {
+ c1 = others[c1];
+ codesize[c1]++;
+ }
+
+ others[c1] = c2; /* chain c2 onto c1's tree branch */
+
+ /* Increment the codesize of everything in c2's tree branch */
+ codesize[c2]++;
+ while (others[c2] >= 0) {
+ c2 = others[c2];
+ codesize[c2]++;
+ }
+ }
+
+ /* Now count the number of symbols of each code length */
+ for (i = 0; i <= 256; i++) {
+ if (codesize[i]) {
+ /* The JPEG standard seems to think that this can't happen, */
+ /* but I'm paranoid... */
+ if (codesize[i] > MAX_CLEN)
+ ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
+
+ bits[codesize[i]]++;
+ }
+ }
+
+ /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
+ * Huffman procedure assigned any such lengths, we must adjust the coding.
+ * Here is what Rec. ITU-T T.81 | ISO/IEC 10918-1 says about how this next
+ * bit works: Since symbols are paired for the longest Huffman code, the
+ * symbols are removed from this length category two at a time. The prefix
+ * for the pair (which is one bit shorter) is allocated to one of the pair;
+ * then, skipping the BITS entry for that prefix length, a code word from the
+ * next shortest nonzero BITS entry is converted into a prefix for two code
+ * words one bit longer.
+ */
+
+ for (i = MAX_CLEN; i > 16; i--) {
+ while (bits[i] > 0) {
+ j = i - 2; /* find length of new prefix to be used */
+ /* NOTE(review): this scan has no lower bound on j; it relies on some
+ * shorter length category being nonempty.
+ */
+ while (bits[j] == 0)
+ j--;
+
+ bits[i] -= 2; /* remove two symbols */
+ bits[i - 1]++; /* one goes in this length */
+ bits[j + 1] += 2; /* two new symbols in this length */
+ bits[j]--; /* symbol of this length is now a prefix */
+ }
+ }
+
+ /* Remove the count for the pseudo-symbol 256 from the largest codelength */
+ /* (i == 16 here, left over from the loop above) */
+ while (bits[i] == 0) /* find largest codelength still in use */
+ i--;
+ bits[i]--;
+
+ /* Return final symbol counts (only for lengths 0..16) */
+ /* Only sizeof(htbl->bits) bytes, i.e. the leading entries of bits[], are copied */
+ memcpy(htbl->bits, bits, sizeof(htbl->bits));
+
+ /* Return a list of the symbols sorted by code length */
+ /* It's not real clear to me why we don't need to consider the codelength
+ * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think
+ * this works.
+ */
+ /* j stops at 255, so the pseudo-symbol 256 never appears in huffval[] */
+ p = 0;
+ for (i = 1; i <= MAX_CLEN; i++) {
+ for (j = 0; j <= 255; j++) {
+ if (codesize[j] == i) {
+ htbl->huffval[p] = (UINT8)j;
+ p++;
+ }
+ }
+ }
+
+ /* Set sent_table FALSE so updated table will be written to JPEG file. */
+ htbl->sent_table = FALSE;
+}
+
+
+/*
+ * End of a statistics-gathering pass: turn the collected symbol counts into
+ * Huffman tables for every DC and AC table slot used by the scan.
+ */
+
+METHODDEF(void)
+finish_pass_gather(j_compress_ptr cinfo)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  boolean dc_done[NUM_HUFF_TBLS];
+  boolean ac_done[NUM_HUFF_TBLS];
+  jpeg_component_info *compptr;
+  JHUFF_TBL **slot;
+  int ci, tbl;
+
+  /* jpeg_gen_optimal_table() clobbers the frequency counts it is given, so
+   * each table must be generated exactly once even when several components
+   * share it.  Track which slots have been handled.
+   */
+  for (tbl = 0; tbl < NUM_HUFF_TBLS; tbl++)
+    dc_done[tbl] = ac_done[tbl] = FALSE;
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* DC table of this component */
+    tbl = compptr->dc_tbl_no;
+    if (!dc_done[tbl]) {
+      slot = &cinfo->dc_huff_tbl_ptrs[tbl];
+      if (*slot == NULL)
+        *slot = jpeg_alloc_huff_table((j_common_ptr)cinfo);
+      jpeg_gen_optimal_table(cinfo, *slot, entropy->dc_count_ptrs[tbl]);
+      dc_done[tbl] = TRUE;
+    }
+
+    /* AC table of this component */
+    tbl = compptr->ac_tbl_no;
+    if (!ac_done[tbl]) {
+      slot = &cinfo->ac_huff_tbl_ptrs[tbl];
+      if (*slot == NULL)
+        *slot = jpeg_alloc_huff_table((j_common_ptr)cinfo);
+      jpeg_gen_optimal_table(cinfo, *slot, entropy->ac_count_ptrs[tbl]);
+      ac_done[tbl] = TRUE;
+    }
+  }
+}
+
+
+#endif /* ENTROPY_OPT_SUPPORTED */
+
+
+/*
+ * Create the Huffman entropy encoder object and attach it to cinfo.
+ * Only the start_pass method is installed here, and every table pointer
+ * starts out NULL.
+ */
+
+GLOBAL(void)
+jinit_huff_encoder(j_compress_ptr cinfo)
+{
+  huff_entropy_ptr entropy;
+  int tblno;
+
+  /* The object is allocated from the JPOOL_IMAGE pool */
+  entropy = (huff_entropy_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(huff_entropy_encoder));
+  entropy->pub.start_pass = start_pass_huff;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
+
+  /* No derived tables or count arrays exist yet */
+  for (tblno = 0; tblno < NUM_HUFF_TBLS; tblno++) {
+    entropy->dc_derived_tbls[tblno] = NULL;
+    entropy->ac_derived_tbls[tblno] = NULL;
+#ifdef ENTROPY_OPT_SUPPORTED
+    entropy->dc_count_ptrs[tblno] = NULL;
+    entropy->ac_count_ptrs[tblno] = NULL;
+#endif
+  }
+}
diff --git a/media/libjpeg/jchuff.h b/media/libjpeg/jchuff.h
new file mode 100644
index 0000000000..da7809a94b
--- /dev/null
+++ b/media/libjpeg/jchuff.h
@@ -0,0 +1,50 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains declarations for Huffman entropy encoding routines
+ * that are shared between the sequential encoder (jchuff.c) and the
+ * progressive encoder (jcphuff.c). No other modules need to see these.
+ */
+
+/* The legal range of a DCT coefficient is
+ * -1024 .. +1023 for 8-bit data;
+ * -16384 .. +16383 for 12-bit data.
+ * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ *
+ * Note that the #else branch below applies to every BITS_IN_JSAMPLE value
+ * other than 8, not just 12.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MAX_COEF_BITS 10
+#else
+#define MAX_COEF_BITS 14
+#endif
+
+/* The progressive Huffman encoder uses an unsigned 16-bit data type to store
+ * absolute values of coefficients, because it is possible to inject a
+ * coefficient value of -32768 into the encoder by attempting to transform a
+ * malformed 12-bit JPEG image, and the absolute value of -32768 would overflow
+ * a signed 16-bit integer.
+ *
+ * NOTE(review): the typedef uses unsigned short, which C only guarantees to
+ * be at least 16 bits wide; "16-bit" above presumably refers to the value
+ * range needed rather than the exact storage size.
+ */
+typedef unsigned short UJCOEF;
+
+/* Derived data constructed for each Huffman table.
+ * Both arrays are indexed by symbol value (0..255): ehufco[S] is the code
+ * assigned to symbol S and ehufsi[S] is that code's length.
+ */
+
+typedef struct {
+ unsigned int ehufco[256]; /* code for each symbol */
+ char ehufsi[256]; /* length of code for each symbol */
+ /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
+} c_derived_tbl;
+
+/* Expand a Huffman table definition into the derived format.
+ * isDC selects between the DC and AC tables, tblno is the table slot, and
+ * the result is delivered through *pdtbl.
+ */
+EXTERN(void) jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC,
+ int tblno, c_derived_tbl **pdtbl);
+
+/* Generate an optimal table definition given the specified counts.
+ * freq[] must have room for index 256 and is modified by the call.
+ */
+EXTERN(void) jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl,
+ long freq[]);
diff --git a/media/libjpeg/jcicc.c b/media/libjpeg/jcicc.c
new file mode 100644
index 0000000000..11037ff694
--- /dev/null
+++ b/media/libjpeg/jcicc.c
@@ -0,0 +1,105 @@
+/*
+ * jcicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to write International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to embed the profile data in a JPEG file while writing it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/*
+ * Since an ICC profile can be larger than the maximum size of a JPEG marker
+ * (64K), we need provisions to split it into multiple markers. The format
+ * defined by the ICC specifies one or more APP2 markers containing the
+ * following data:
+ * Identifying string ASCII "ICC_PROFILE\0" (12 bytes)
+ * Marker sequence number 1 for first APP2, 2 for next, etc (1 byte)
+ * Number of markers Total number of APP2's used (1 byte)
+ * Profile data (remainder of APP2 data)
+ * Decoders should use the marker sequence numbers to reassemble the profile,
+ * rather than assuming that the APP2 markers appear in the correct sequence.
+ */
+
+/* ICC_OVERHEAD_LEN is the 12-byte identifying string plus the 1-byte marker
+ * sequence number and the 1-byte marker count described above.
+ * MAX_DATA_BYTES_IN_MARKER is therefore the largest slice of profile data
+ * that fits in one APP2 marker.
+ */
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
+#define MAX_BYTES_IN_MARKER 65533 /* maximum data len of a JPEG marker */
+#define MAX_DATA_BYTES_IN_MARKER (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN)
+
+
+/*
+ * This routine writes the given ICC profile data into a JPEG file. It *must*
+ * be called AFTER calling jpeg_start_compress() and BEFORE the first call to
+ * jpeg_write_scanlines(). (This ordering ensures that the APP2 marker(s) will
+ * appear after the SOI and JFIF or Adobe markers, but before all else.)
+ */
+
+GLOBAL(void)
+jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len)
+{
+ unsigned int num_markers; /* total number of markers we'll write */
+ int cur_marker = 1; /* per spec, counting starts at 1 */
+ unsigned int length; /* number of bytes to write in this marker */
+
+ if (icc_data_ptr == NULL || icc_data_len == 0)
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ if (cinfo->global_state < CSTATE_SCANNING)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Calculate the number of markers we'll need, rounding up of course */
+ num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER;
+ if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len)
+ num_markers++;
+
+ while (icc_data_len > 0) {
+ /* length of profile to put in this marker */
+ length = icc_data_len;
+ if (length > MAX_DATA_BYTES_IN_MARKER)
+ length = MAX_DATA_BYTES_IN_MARKER;
+ icc_data_len -= length;
+
+ /* Write the JPEG marker header (APP2 code and marker length) */
+ jpeg_write_m_header(cinfo, ICC_MARKER,
+ (unsigned int)(length + ICC_OVERHEAD_LEN));
+
+ /* Write the marker identifying string "ICC_PROFILE" (null-terminated). We
+ * code it in this less-than-transparent way so that the code works even if
+ * the local character set is not ASCII.
+ */
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x5F);
+ jpeg_write_m_byte(cinfo, 0x50);
+ jpeg_write_m_byte(cinfo, 0x52);
+ jpeg_write_m_byte(cinfo, 0x4F);
+ jpeg_write_m_byte(cinfo, 0x46);
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x4C);
+ jpeg_write_m_byte(cinfo, 0x45);
+ jpeg_write_m_byte(cinfo, 0x0);
+
+ /* Add the sequencing info */
+ jpeg_write_m_byte(cinfo, cur_marker);
+ jpeg_write_m_byte(cinfo, (int)num_markers);
+
+ /* Add the profile data */
+ while (length--) {
+ jpeg_write_m_byte(cinfo, *icc_data_ptr);
+ icc_data_ptr++;
+ }
+ cur_marker++;
+ }
+}
diff --git a/media/libjpeg/jcinit.c b/media/libjpeg/jcinit.c
new file mode 100644
index 0000000000..157353a22e
--- /dev/null
+++ b/media/libjpeg/jcinit.c
@@ -0,0 +1,80 @@
+/*
+ * jcinit.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains initialization logic for the JPEG compressor.
+ * This routine is in charge of selecting the modules to be executed and
+ * making an initialization call to each one.
+ *
+ * Logically, this code belongs in jcmaster.c. It's split out because
+ * linking this routine implies linking the entire compression library.
+ * For a transcoding-only application, we want to be able to use jcmaster.c
+ * without linking in the whole library.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/*
+ * Select and initialize the compression modules for one image.
+ * The modules are initialized in a fixed order: master control,
+ * preprocessing (skipped for raw data input), forward DCT, entropy encoder,
+ * coefficient and main controllers, marker writer.  Afterwards the virtual
+ * arrays are realized and the file header is written.
+ */
+
+GLOBAL(void)
+jinit_compress_master(j_compress_ptr cinfo)
+{
+  boolean need_full_coef_buffer;
+
+  /* Master control comes first (includes parameter checking/processing) */
+  jinit_c_master_control(cinfo, FALSE /* full compression */);
+
+  /* Preprocessing chain; not set up when the caller supplies raw data */
+  if (!cinfo->raw_data_in) {
+    jinit_color_converter(cinfo);
+    jinit_downsampler(cinfo);
+    jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
+  }
+
+  /* Forward DCT */
+  jinit_forward_dct(cinfo);
+
+  /* Entropy encoder: arithmetic, progressive Huffman or sequential Huffman */
+  if (cinfo->arith_code) {
+#ifdef C_ARITH_CODING_SUPPORTED
+    jinit_arith_encoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+  } else if (cinfo->progressive_mode) {
+#ifdef C_PROGRESSIVE_SUPPORTED
+    jinit_phuff_encoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  } else {
+    jinit_huff_encoder(cinfo);
+  }
+
+  /* A full-image coefficient buffer is needed in any multi-pass mode */
+  need_full_coef_buffer = (boolean)(cinfo->num_scans > 1 ||
+                                    cinfo->optimize_coding);
+  jinit_c_coef_controller(cinfo, need_full_coef_buffer);
+  jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
+
+  jinit_marker_writer(cinfo);
+
+  /* All requests are in; let the memory manager allocate virtual arrays */
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+  /* The datastream header (SOI) is written right away, while frame and scan
+   * headers are postponed.  That gives the application the chance to insert
+   * special markers after the SOI.
+   */
+  (*cinfo->marker->write_file_header) (cinfo);
+}
diff --git a/media/libjpeg/jcmainct.c b/media/libjpeg/jcmainct.c
new file mode 100644
index 0000000000..3f23028c46
--- /dev/null
+++ b/media/libjpeg/jcmainct.c
@@ -0,0 +1,162 @@
+/*
+ * jcmainct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the main buffer controller for compression.
+ * The main buffer lies between the pre-processor and the JPEG
+ * compressor proper; it holds downsampled data in the JPEG colorspace.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Private buffer controller object.
+ * The public part must come first so that the object can be reached through
+ * a struct jpeg_c_main_controller pointer (see the casts in this file).
+ */
+
+typedef struct {
+ struct jpeg_c_main_controller pub; /* public fields */
+
+ JDIMENSION cur_iMCU_row; /* number of current iMCU row */
+ JDIMENSION rowgroup_ctr; /* counts row groups received in iMCU row */
+ boolean suspended; /* remember if we suspended output */
+ J_BUF_MODE pass_mode; /* current operating mode */
+
+ /* If using just a strip buffer, this points to the entire set of buffers
+ * (we allocate one for each component). In the full-image case, this
+ * points to the currently accessible strips of the virtual arrays.
+ *
+ * Note: the code in this file only ever sets up the strip buffer; both
+ * jinit_c_main_controller() and start_pass_main() reject any other mode
+ * with JERR_BAD_BUFFER_MODE.
+ */
+ JSAMPARRAY buffer[MAX_COMPONENTS];
+} my_main_controller;
+
+typedef my_main_controller *my_main_ptr;
+
+
+/* Forward declarations */
+/* Needed because start_pass_main() installs this routine before its
+ * definition appears.
+ */
+METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
+ JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr,
+ JDIMENSION in_rows_avail);
+
+
+/*
+ * Per-pass setup of the main buffer controller.
+ * Nothing is touched in raw-data mode.  Otherwise only JBUF_PASS_THRU is
+ * accepted; the counters are reset and the pass-through processing routine
+ * is installed.
+ */
+
+METHODDEF(void)
+start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_main_ptr main_ptr;
+
+  if (!cinfo->raw_data_in) {
+    /* Pass-through is the only buffer mode implemented here */
+    if (pass_mode != JBUF_PASS_THRU)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
+    main_ptr = (my_main_ptr)cinfo->main;
+    main_ptr->pub.process_data = process_data_simple_main;
+    main_ptr->pass_mode = pass_mode;    /* kept for use by process_data */
+    main_ptr->suspended = FALSE;
+    main_ptr->rowgroup_ctr = 0;         /* start from an empty buffer */
+    main_ptr->cur_iMCU_row = 0;
+  }
+}
+
+
+/*
+ * Process some data.
+ * This routine handles the simple pass-through mode,
+ * where we have only a strip buffer.
+ *
+ * input_buf, in_row_ctr and in_rows_avail are handed unchanged to the
+ * preprocessor, which fills main_ptr->buffer and advances
+ * main_ptr->rowgroup_ctr. The routine returns when the buffer cannot be
+ * filled to DCTSIZE row groups, when the compressor does not accept the
+ * buffered row, or when all iMCU rows have been processed.
+ */
+
+METHODDEF(void)
+process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
+{
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+
+ while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
+ /* Read input data if we haven't filled the main buffer yet */
+ if (main_ptr->rowgroup_ctr < DCTSIZE)
+ (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
+ in_rows_avail, main_ptr->buffer,
+ &main_ptr->rowgroup_ctr,
+ (JDIMENSION)DCTSIZE);
+
+ /* If we don't have a full iMCU row buffered, return to application for
+ * more data. Note that preprocessor will always pad to fill the iMCU row
+ * at the bottom of the image.
+ */
+ if (main_ptr->rowgroup_ctr != DCTSIZE)
+ return;
+
+ /* Send the completed row to the compressor */
+ if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+ /* If compressor did not consume the whole row, then we must need to
+ * suspend processing and return to the application. In this situation
+ * we pretend we didn't yet consume the last input row; otherwise, if
+ * it happened to be the last row of the image, the application would
+ * think we were done.
+ */
+ /* The suspended flag makes sure the counter is backed up only once,
+ * however many times the compressor refuses the same row.
+ */
+ if (!main_ptr->suspended) {
+ (*in_row_ctr)--;
+ main_ptr->suspended = TRUE;
+ }
+ return;
+ }
+ /* We did finish the row. Undo our little suspension hack if a previous
+ * call suspended; then mark the main buffer empty.
+ */
+ if (main_ptr->suspended) {
+ (*in_row_ctr)++;
+ main_ptr->suspended = FALSE;
+ }
+ main_ptr->rowgroup_ctr = 0;
+ main_ptr->cur_iMCU_row++;
+ }
+}
+
+
+/*
+ * Create the main buffer controller and attach it to cinfo.
+ * Unless raw data input is in use, one strip buffer per component is
+ * allocated.  A request for a full-image buffer is rejected with
+ * JERR_BAD_BUFFER_MODE.
+ */
+
+GLOBAL(void)
+jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+{
+  my_main_ptr main_ptr;
+  jpeg_component_info *compptr;
+  int ci;
+
+  main_ptr = (my_main_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(my_main_controller));
+  main_ptr->pub.start_pass = start_pass_main;
+  cinfo->main = (struct jpeg_c_main_controller *)main_ptr;
+
+  /* Raw-data mode has no use for a buffer */
+  if (cinfo->raw_data_in)
+    return;
+
+  /* Only the strip-buffer arrangement is available */
+  if (need_full_buffer) {
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    return;
+  }
+
+  /* The buffer holds downsampled data, so each component gets a strip of
+   * its own size: width_in_blocks * DCTSIZE samples per row and
+   * v_samp_factor * DCTSIZE rows.
+   */
+  for (ci = 0; ci < cinfo->num_components; ci++) {
+    compptr = cinfo->comp_info + ci;
+    main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       compptr->width_in_blocks * DCTSIZE,
+       (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
+  }
+}
diff --git a/media/libjpeg/jcmarker.c b/media/libjpeg/jcmarker.c
new file mode 100644
index 0000000000..801fbab4ef
--- /dev/null
+++ b/media/libjpeg/jcmarker.c
@@ -0,0 +1,664 @@
+/*
+ * jcmarker.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to write JPEG datastream markers.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+typedef enum { /* JPEG marker codes */
+ M_SOF0 = 0xc0, /* SOF: baseline implementation */
+ M_SOF1 = 0xc1, /* SOF: non-baseline Huffman */
+ M_SOF2 = 0xc2, /* SOF: progressive Huffman */
+ M_SOF3 = 0xc3,
+
+ M_SOF5 = 0xc5,
+ M_SOF6 = 0xc6,
+ M_SOF7 = 0xc7,
+
+ M_JPG = 0xc8,
+ M_SOF9 = 0xc9, /* SOF: sequential arithmetic */
+ M_SOF10 = 0xca, /* SOF: progressive arithmetic */
+ M_SOF11 = 0xcb,
+
+ M_SOF13 = 0xcd,
+ M_SOF14 = 0xce,
+ M_SOF15 = 0xcf,
+
+ M_DHT = 0xc4, /* Huffman table, written by emit_dht() */
+
+ M_DAC = 0xcc, /* arithmetic conditioning info, written by emit_dac() */
+
+ M_RST0 = 0xd0,
+ M_RST1 = 0xd1,
+ M_RST2 = 0xd2,
+ M_RST3 = 0xd3,
+ M_RST4 = 0xd4,
+ M_RST5 = 0xd5,
+ M_RST6 = 0xd6,
+ M_RST7 = 0xd7,
+
+ M_SOI = 0xd8, /* first marker of the datastream */
+ M_EOI = 0xd9, /* datastream trailer */
+ M_SOS = 0xda, /* scan header; compressed data follows it */
+ M_DQT = 0xdb, /* quantization table, written by emit_dqt() */
+ M_DNL = 0xdc,
+ M_DRI = 0xdd, /* restart interval, written by emit_dri() */
+ M_DHP = 0xde,
+ M_EXP = 0xdf,
+
+ M_APP0 = 0xe0, /* carries the JFIF marker */
+ M_APP1 = 0xe1,
+ M_APP2 = 0xe2,
+ M_APP3 = 0xe3,
+ M_APP4 = 0xe4,
+ M_APP5 = 0xe5,
+ M_APP6 = 0xe6,
+ M_APP7 = 0xe7,
+ M_APP8 = 0xe8,
+ M_APP9 = 0xe9,
+ M_APP10 = 0xea,
+ M_APP11 = 0xeb,
+ M_APP12 = 0xec,
+ M_APP13 = 0xed,
+ M_APP14 = 0xee, /* carries the Adobe marker */
+ M_APP15 = 0xef,
+
+ M_JPG0 = 0xf0,
+ M_JPG13 = 0xfd,
+ M_COM = 0xfe,
+
+ M_TEM = 0x01,
+
+ M_ERROR = 0x100 /* does not fit in the single code byte emit_marker() writes */
+} JPEG_MARKER;
+
+
+/* Private state */
+
+typedef struct {
+ struct jpeg_marker_writer pub; /* public fields (first member; object is cast to/from cinfo->marker) */
+
+ unsigned int last_restart_interval; /* last DRI value emitted; 0 after SOI; compared in write_scan_header */
+} my_marker_writer;
+
+typedef my_marker_writer *my_marker_ptr;
+
+
+/*
+ * Basic output routines.
+ *
+ * Note that we do not support suspension while writing a marker.
+ * Therefore, an application using suspension must ensure that there is
+ * enough buffer space for the initial markers (typ. 600-700 bytes) before
+ * calling jpeg_start_compress, and enough space to write the trailing EOI
+ * (a few bytes) before calling jpeg_finish_compress. Multipass compression
+ * modes are not supported at all with suspension, so those two are the only
+ * points where markers will be written.
+ */
+
+LOCAL(void)
+emit_byte(j_compress_ptr cinfo, int val)
+/* Append one byte to the output buffer; hand the buffer off when it fills */
+{
+  struct jpeg_destination_mgr *out = cinfo->dest;
+  *out->next_output_byte = (JOCTET)val;
+  out->next_output_byte++;
+  out->free_in_buffer--;
+  /* Suspension is not supported here, so a refused hand-off is fatal. */
+  if (out->free_in_buffer == 0 && !(*out->empty_output_buffer) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+LOCAL(void)
+emit_marker(j_compress_ptr cinfo, JPEG_MARKER mark)
+/* Emit a marker code: a 0xFF byte followed by the code byte itself */
+{
+ emit_byte(cinfo, 0xFF);
+ emit_byte(cinfo, (int)mark);
+}
+
+
+LOCAL(void)
+emit_2bytes(j_compress_ptr cinfo, int value)
+/* Emit the low 16 bits of value, most significant byte first (JPEG order) */
+{
+  emit_byte(cinfo, (value & 0xFF00) >> 8);
+  emit_byte(cinfo, value & 0x00FF);
+}
+
+
+/*
+ * Routines to write specific marker types.
+ */
+
+LOCAL(int)
+emit_dqt(j_compress_ptr cinfo, int index)
+/* Emit a DQT marker for table 'index', unless that table was already sent */
+/* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
+{
+  JQUANT_TBL *qtbl;
+  int prec = 0;
+  int i;
+
+  /* Check the slot number before using it to index the pointer array. */
+  if (index < 0 || index >= NUM_QUANT_TBLS)
+    ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
+  qtbl = cinfo->quant_tbl_ptrs[index];
+  if (qtbl == NULL)
+    ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
+
+  /* Any entry above 255 forces 16-bit precision for the whole table. */
+  for (i = 0; i < DCTSIZE2; i++) {
+    if (qtbl->quantval[i] > 255)
+      prec = 1;
+  }
+
+  if (!qtbl->sent_table) {
+    emit_marker(cinfo, M_DQT);
+    emit_2bytes(cinfo, prec ? DCTSIZE2 * 2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_byte(cinfo, index + (prec << 4));      /* high nibble: precision */
+    for (i = 0; i < DCTSIZE2; i++) {
+      /* The table entries must be emitted in zigzag order. */
+      unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
+      if (prec)
+        emit_byte(cinfo, (int)(qval >> 8));
+      emit_byte(cinfo, (int)(qval & 0xFF));
+    }
+    qtbl->sent_table = TRUE;
+  }
+
+  return prec;
+}
+
+
+LOCAL(void)
+emit_dht(j_compress_ptr cinfo, int index, boolean is_ac)
+/* Emit a DHT marker for one DC or AC table, unless it was already sent */
+{
+  JHUFF_TBL *htbl;
+  int length, i;
+  /* Table id as written to the file and reported in errors (AC bit 0x10) */
+  int tbl_id = is_ac ? index + 0x10 : index;
+
+  /* Check the slot number before using it to index a pointer array. */
+  if (index < 0 || index >= NUM_HUFF_TBLS)
+    ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl_id);
+  htbl = is_ac ? cinfo->ac_huff_tbl_ptrs[index] :
+                 cinfo->dc_huff_tbl_ptrs[index];
+  if (htbl == NULL)
+    ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl_id);
+
+  if (!htbl->sent_table) {
+    emit_marker(cinfo, M_DHT);
+
+    /* The sum of bits[1..16] is the number of huffval[] entries to write. */
+    length = 0;
+    for (i = 1; i <= 16; i++)
+      length += htbl->bits[i];
+
+    emit_2bytes(cinfo, length + 2 + 1 + 16);
+    emit_byte(cinfo, tbl_id);
+
+    for (i = 1; i <= 16; i++)
+      emit_byte(cinfo, htbl->bits[i]);
+    for (i = 0; i < length; i++)
+      emit_byte(cinfo, htbl->huffval[i]);
+
+    htbl->sent_table = TRUE;
+  }
+}
+
+
+LOCAL(void)
+emit_dac(j_compress_ptr cinfo)
+/* Emit a single DAC marker carrying the conditioning parameters of every */
+/* arithmetic-coding table used by the current scan.  Nothing is written */
+/* if the scan uses no table or arithmetic coding is not compiled in. */
+{
+#ifdef C_ARITH_CODING_SUPPORTED
+  char dc_used[NUM_ARITH_TBLS];
+  char ac_used[NUM_ARITH_TBLS];
+  int tbl, c, nused = 0;
+  /* DC needs no table for refinement scan; AC none when not present */
+  boolean want_dc = (cinfo->Ss == 0 && cinfo->Ah == 0);
+  boolean want_ac = (cinfo->Se != 0);
+
+  for (tbl = 0; tbl < NUM_ARITH_TBLS; tbl++) {
+    dc_used[tbl] = 0;
+    ac_used[tbl] = 0;
+  }
+
+  /* Flag the tables referenced by the components of this scan. */
+  for (c = 0; c < cinfo->comps_in_scan; c++) {
+    if (want_dc)
+      dc_used[cinfo->cur_comp_info[c]->dc_tbl_no] = 1;
+    if (want_ac)
+      ac_used[cinfo->cur_comp_info[c]->ac_tbl_no] = 1;
+  }
+
+  for (tbl = 0; tbl < NUM_ARITH_TBLS; tbl++)
+    nused += dc_used[tbl] + ac_used[tbl];
+  if (nused == 0)
+    return;
+
+  emit_marker(cinfo, M_DAC);
+  emit_2bytes(cinfo, nused * 2 + 2);    /* 2 bytes per table + length field */
+  for (tbl = 0; tbl < NUM_ARITH_TBLS; tbl++) {
+    if (dc_used[tbl]) {
+      emit_byte(cinfo, tbl);
+      emit_byte(cinfo,
+                cinfo->arith_dc_L[tbl] + (cinfo->arith_dc_U[tbl] << 4));
+    }
+    if (ac_used[tbl]) {
+      emit_byte(cinfo, tbl + 0x10);
+      emit_byte(cinfo, cinfo->arith_ac_K[tbl]);
+    }
+  }
+#endif /* C_ARITH_CODING_SUPPORTED */
+}
+
+
+LOCAL(void)
+emit_dri(j_compress_ptr cinfo)
+/* Emit a DRI marker announcing cinfo->restart_interval */
+{
+  const int seg_length = 2 + 2;         /* length field + 16-bit interval */
+
+  emit_marker(cinfo, M_DRI);
+  emit_2bytes(cinfo, seg_length);
+  emit_2bytes(cinfo, (int)cinfo->restart_interval);
+}
+
+
+LOCAL(void)
+emit_sof(j_compress_ptr cinfo, JPEG_MARKER code)
+/* Emit the frame header (SOF), using the marker code chosen by the caller */
+{
+  int n;
+
+  emit_marker(cinfo, code);
+  /* length: 2 (length field) + 5 (precision, height, width) + 1 (component
+   * count) + 3 per component */
+  emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1);
+
+  /* The SOF height and width fields are only 16 bits wide. */
+  if ((long)cinfo->_jpeg_width > 65535L ||
+      (long)cinfo->_jpeg_height > 65535L)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)65535);
+
+  emit_byte(cinfo, cinfo->data_precision);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_height);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_width);
+  emit_byte(cinfo, cinfo->num_components);
+
+  for (n = 0; n < cinfo->num_components; n++) {
+    const jpeg_component_info *comp = &cinfo->comp_info[n];
+    emit_byte(cinfo, comp->component_id);
+    emit_byte(cinfo, (comp->h_samp_factor << 4) + comp->v_samp_factor);
+    emit_byte(cinfo, comp->quant_tbl_no);
+  }
+}
+
+
+LOCAL(void)
+emit_sos(j_compress_ptr cinfo)
+/* Emit the scan header (SOS) for the scan currently set up in cinfo */
+{
+  int n, selectors;
+  jpeg_component_info *comp;
+
+  emit_marker(cinfo, M_SOS);
+  /* length: 2 (length field) + 1 (count) + 2 per component + 3 trailer */
+  emit_2bytes(cinfo, 2 * cinfo->comps_in_scan + 2 + 1 + 3);
+  emit_byte(cinfo, cinfo->comps_in_scan);
+
+  for (n = 0; n < cinfo->comps_in_scan; n++) {
+    comp = cinfo->cur_comp_info[n];
+    emit_byte(cinfo, comp->component_id);
+
+    /* Table selector byte: DC table in the high nibble, AC in the low.
+     * A field that the scan does not use is written as 0; this is
+     * recommended by the P&M text but does not seem to be specified in
+     * the standard.
+     */
+    selectors = 0;
+    if (cinfo->Ss == 0 && cinfo->Ah == 0)     /* else: DC needs no table */
+      selectors += comp->dc_tbl_no << 4;
+    if (cinfo->Se)                            /* else: AC not present */
+      selectors += comp->ac_tbl_no;
+    emit_byte(cinfo, selectors);
+  }
+
+  emit_byte(cinfo, cinfo->Ss);
+  emit_byte(cinfo, cinfo->Se);
+  emit_byte(cinfo, (cinfo->Ah << 4) + cinfo->Al);
+}
+
+
+LOCAL(void)
+emit_jfif_app0(j_compress_ptr cinfo)
+/* Emit a JFIF-compliant APP0 marker */
+{
+  /* Segment layout, in order:
+   *   2 bytes  length of APP0 block
+   *   5 bytes  block ID: ASCII "JFIF" plus a zero byte terminating it
+   *   2 bytes  version, major first
+   *   1 byte   units (0x00 = none, 0x01 = inch, 0x02 = cm)
+   *   2 bytes  Xdpu, dots per unit horizontal
+   *   2 bytes  Ydpu, dots per unit vertical
+   *   2 bytes  thumbnail X size and Y size, 1 byte each
+   */
+  static const unsigned char jfif_id[5] = { 0x4A, 0x46, 0x49, 0x46, 0 };
+  int k;
+
+  emit_marker(cinfo, M_APP0);
+  emit_2bytes(cinfo, 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1);       /* length */
+
+  for (k = 0; k < 5; k++)
+    emit_byte(cinfo, jfif_id[k]);
+
+  /* Version fields, then pixel size information */
+  emit_byte(cinfo, cinfo->JFIF_major_version);
+  emit_byte(cinfo, cinfo->JFIF_minor_version);
+  emit_byte(cinfo, cinfo->density_unit);
+  emit_2bytes(cinfo, (int)cinfo->X_density);
+  emit_2bytes(cinfo, (int)cinfo->Y_density);
+
+  /* Both thumbnail dimensions are 0: no thumbnail image */
+  emit_byte(cinfo, 0);
+  emit_byte(cinfo, 0);
+}
+
+
+LOCAL(void)
+emit_adobe_app14(j_compress_ptr cinfo)
+/* Emit an Adobe APP14 marker */
+{
+  /* Segment layout, in order:
+   *   2 bytes  length of APP14 block
+   *   5 bytes  block ID: ASCII "Adobe"
+   *   2 bytes  version number (currently 100)
+   *   2 bytes  Flags0 (currently 0)
+   *   2 bytes  Flags1 (currently 0)
+   *   1 byte   color transform
+   *
+   * Although Adobe TN 5116 mentions Version = 101, all the Adobe files
+   * now in circulation seem to use Version = 100, so that's what we write.
+   *
+   * The color transform byte is written as 1 if the JPEG color space is
+   * YCbCr, 2 if it's YCCK, and 0 otherwise.  Adobe's definition has to do
+   * with whether the encoder performed a transformation, which is pretty
+   * useless.
+   */
+  static const unsigned char adobe_id[5] = { 0x41, 0x64, 0x6F, 0x62, 0x65 };
+  int k;
+  int transform;
+
+  emit_marker(cinfo, M_APP14);
+  emit_2bytes(cinfo, 2 + 5 + 2 + 2 + 2 + 1);    /* length */
+
+  for (k = 0; k < 5; k++)
+    emit_byte(cinfo, adobe_id[k]);
+
+  emit_2bytes(cinfo, 100);      /* Version */
+  emit_2bytes(cinfo, 0);        /* Flags0 */
+  emit_2bytes(cinfo, 0);        /* Flags1 */
+
+  /* Color transform byte */
+  if (cinfo->jpeg_color_space == JCS_YCbCr)
+    transform = 1;
+  else if (cinfo->jpeg_color_space == JCS_YCCK)
+    transform = 2;
+  else
+    transform = 0;
+
+  emit_byte(cinfo, transform);
+}
+
+
+/*
+ * These routines allow writing an arbitrary marker with parameters.
+ * The only intended use is to emit COM or APPn markers after calling
+ * write_file_header and before calling write_frame_header.
+ * Other uses are not guaranteed to produce desirable results.
+ * Counting the parameter bytes properly is the caller's responsibility.
+ */
+
+METHODDEF(void)
+write_marker_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
+/* Emit the marker code and length field of an arbitrary marker */
+{
+  /* Largest payload whose total length still fits the 16-bit length field */
+  const unsigned int max_datalen = (unsigned int)65533;
+  if (datalen > max_datalen)
+    ERREXIT(cinfo, JERR_BAD_LENGTH);
+  emit_marker(cinfo, (JPEG_MARKER)marker);
+  emit_2bytes(cinfo, (int)(datalen + 2));        /* total length */
+}
+
+METHODDEF(void)
+write_marker_byte(j_compress_ptr cinfo, int val)
+/* Emit one byte of marker parameters following write_marker_header; the caller keeps the count consistent with datalen */
+{
+ emit_byte(cinfo, val);
+}
+
+
+/*
+ * Write datastream header.
+ * This consists of an SOI and optional APPn markers.
+ * We recommend use of the JFIF marker, but not the Adobe marker,
+ * when using YCbCr or grayscale data. The JFIF marker should NOT
+ * be used for any other JPEG colorspace. The Adobe marker is helpful
+ * to distinguish RGB, CMYK, and YCCK colorspaces.
+ * Note that an application can write additional header markers after
+ * jpeg_start_compress returns.
+ */
+
+METHODDEF(void)
+write_file_header(j_compress_ptr cinfo)
+{
+  my_marker_ptr self = (my_marker_ptr)cinfo->marker;
+
+  /* SOI comes first; it is defined to reset the restart interval to 0. */
+  emit_marker(cinfo, M_SOI);
+  self->last_restart_interval = 0;
+
+  /* Then the optional application markers: JFIF APP0, then Adobe APP14. */
+  if (cinfo->write_JFIF_header)
+    emit_jfif_app0(cinfo);
+  if (cinfo->write_Adobe_marker)
+    emit_adobe_app14(cinfo);
+}
+
+
+/*
+ * Write frame header.
+ * This consists of DQT and SOFn markers.
+ * Note that we do not emit the SOF until we have emitted the DQT(s).
+ * This avoids compatibility problems with incorrect implementations that
+ * try to error-check the quant table numbers as soon as they see the SOF.
+ */
+
+METHODDEF(void)
+write_frame_header(j_compress_ptr cinfo)
+{
+  int n;
+  int any_16bit = 0;
+  boolean baseline;
+  JPEG_MARKER sof_code;
+
+  /* Write the quantization tables ahead of the SOF.  emit_dqt() suppresses
+   * tables that were already sent and returns 1 for a 16-bit table.
+   */
+  for (n = 0; n < cinfo->num_components; n++) {
+    jpeg_component_info *comp = &cinfo->comp_info[n];
+    any_16bit += emit_dqt(cinfo, comp->quant_tbl_no);
+  }
+  /* any_16bit is now nonzero iff some component uses a 16-bit table. */
+
+  /* Decide whether the frame qualifies as baseline.
+   * Note we assume that Huffman table numbers won't be changed later.
+   */
+  baseline = !(cinfo->arith_code || cinfo->progressive_mode ||
+               cinfo->data_precision != 8);
+  if (baseline) {
+    /* Table numbers above 1 rule out baseline. */
+    for (n = 0; n < cinfo->num_components; n++) {
+      if (cinfo->comp_info[n].dc_tbl_no > 1 ||
+          cinfo->comp_info[n].ac_tbl_no > 1)
+        baseline = FALSE;
+    }
+    if (any_16bit && baseline) {
+      baseline = FALSE;
+      /* If it's baseline except for quantizer size, warn the user */
+      TRACEMS(cinfo, 0, JTRC_16BIT_TABLES);
+    }
+  }
+
+  /* Select the SOF code that matches the coding process. */
+  if (cinfo->arith_code) {
+    if (cinfo->progressive_mode)
+      sof_code = M_SOF10;       /* progressive arithmetic */
+    else
+      sof_code = M_SOF9;        /* sequential arithmetic */
+  } else if (cinfo->progressive_mode) {
+    sof_code = M_SOF2;          /* progressive Huffman */
+  } else if (baseline) {
+    sof_code = M_SOF0;          /* baseline implementation */
+  } else {
+    sof_code = M_SOF1;          /* non-baseline Huffman file */
+  }
+
+  emit_sof(cinfo, sof_code);
+}
+
+
+/*
+ * Write scan header.
+ * This consists of DHT or DAC markers, optional DRI, and SOS.
+ * Compressed data will be written following the SOS.
+ */
+
+METHODDEF(void)
+write_scan_header(j_compress_ptr cinfo)
+{
+  my_marker_ptr self = (my_marker_ptr)cinfo->marker;
+  int n;
+
+  if (!cinfo->arith_code) {
+    /* Huffman coding: write the tables this scan needs.  emit_dht()
+     * suppresses any table that has already been sent.
+     */
+    for (n = 0; n < cinfo->comps_in_scan; n++) {
+      jpeg_component_info *comp = cinfo->cur_comp_info[n];
+      /* DC needs no table for refinement scan */
+      if (cinfo->Ss == 0 && cinfo->Ah == 0)
+        emit_dht(cinfo, comp->dc_tbl_no, FALSE);
+      /* AC needs no table when not present */
+      if (cinfo->Se)
+        emit_dht(cinfo, comp->ac_tbl_no, TRUE);
+    }
+  } else {
+    /* Arithmetic coding: write the conditioning info.  A multi-scan file
+     * may repeat it, but the marker is so small that it is hardly worth
+     * worrying about.
+     */
+    emit_dac(cinfo);
+  }
+
+  /* The restart interval may differ from scan to scan, so DRI is handled
+   * per scan; it is written only when the value differs from the last one
+   * emitted.
+   */
+  if (cinfo->restart_interval != self->last_restart_interval) {
+    emit_dri(cinfo);
+    self->last_restart_interval = cinfo->restart_interval;
+  }
+
+  emit_sos(cinfo);
+}
+
+
+/*
+ * Write datastream trailer.
+ */
+
+METHODDEF(void)
+write_file_trailer(j_compress_ptr cinfo)
+{
+ emit_marker(cinfo, M_EOI); /* the trailer consists of the EOI marker only */
+}
+
+
+/*
+ * Write an abbreviated table-specification datastream.
+ * This consists of SOI, DQT and DHT tables, and EOI.
+ * Any table that is defined and not marked sent_table = TRUE will be
+ * emitted. Note that all tables will be marked sent_table = TRUE at exit.
+ */
+
+METHODDEF(void)
+write_tables_only(j_compress_ptr cinfo)
+{
+  int slot;
+
+  emit_marker(cinfo, M_SOI);
+  /* Every defined quantization table; the precision result is not needed. */
+  for (slot = 0; slot < NUM_QUANT_TBLS; slot++)
+    if (cinfo->quant_tbl_ptrs[slot] != NULL)
+      (void)emit_dqt(cinfo, slot);
+
+  /* Huffman tables are written only when arithmetic coding is not selected. */
+  if (!cinfo->arith_code) {
+    for (slot = 0; slot < NUM_HUFF_TBLS; slot++) {
+      if (cinfo->dc_huff_tbl_ptrs[slot] != NULL)
+        emit_dht(cinfo, slot, FALSE);
+      if (cinfo->ac_huff_tbl_ptrs[slot] != NULL)
+        emit_dht(cinfo, slot, TRUE);
+    }
+  }
+
+  emit_marker(cinfo, M_EOI);
+}
+
+
+/*
+ * Initialize the marker writer module.
+ */
+
+GLOBAL(void)
+jinit_marker_writer(j_compress_ptr cinfo)
+{
+  my_marker_ptr self;
+  struct jpeg_marker_writer *methods;
+
+  /* Create the subobject in the image pool and attach it to cinfo */
+  self = (my_marker_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(my_marker_writer));
+  methods = &self->pub;
+  cinfo->marker = methods;
+  /* Fill in the method table */
+  methods->write_file_header = write_file_header;
+  methods->write_frame_header = write_frame_header;
+  methods->write_scan_header = write_scan_header;
+  methods->write_file_trailer = write_file_trailer;
+  methods->write_tables_only = write_tables_only;
+  methods->write_marker_header = write_marker_header;
+  methods->write_marker_byte = write_marker_byte;
+  self->last_restart_interval = 0;      /* private state: no DRI emitted yet */
+}
diff --git a/media/libjpeg/jcmaster.c b/media/libjpeg/jcmaster.c
new file mode 100644
index 0000000000..b821710ac3
--- /dev/null
+++ b/media/libjpeg/jcmaster.c
@@ -0,0 +1,639 @@
+/*
+ * jcmaster.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2016, 2018, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains master control logic for the JPEG compressor.
+ * These routines are concerned with parameter validation, initial setup,
+ * and inter-pass control (determining the number of passes and the work
+ * to be done in each pass).
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Private state */
+
+typedef enum {
+ main_pass, /* input data; also Huffman optimization or data output for the first scan */
+ huff_opt_pass, /* Huffman code optimization pass */
+ output_pass /* data output pass */
+} c_pass_type;
+
+typedef struct {
+ struct jpeg_comp_master pub; /* public fields (first member; cinfo->master is cast to my_master_ptr) */
+
+ c_pass_type pass_type; /* the type of the current pass */
+
+ int pass_number; /* # of passes completed */
+ int total_passes; /* total # of passes needed */
+
+ int scan_number; /* current index in scan_info[]; read by select_scan_parameters */
+
+ /*
+ * This is here so we can add libjpeg-turbo version/build information to the
+ * global string table without introducing a new global symbol. Adding this
+ * information to the global string table allows one to examine a binary
+ * object and determine which version of libjpeg-turbo it was built from or
+ * linked against.
+ */
+ const char *jpeg_version;
+
+} my_comp_master;
+
+typedef my_comp_master *my_master_ptr;
+
+
+/*
+ * Support routines that do various essential calculations.
+ */
+
+#if JPEG_LIB_VERSION >= 70
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
+/* Set the JPEG dimensions and minimum DCT sizes from the input image */
+{
+  /* Hardwired to "no scaling": full-size DCT, output same size as input */
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->jpeg_width = cinfo->image_width;
+}
+#endif
+
+
+LOCAL(void)
+initial_setup(j_compress_ptr cinfo, boolean transcode_only)
+/* Validate image parameters and compute derived sizes before master selection */
+{
+ int ci;
+ jpeg_component_info *compptr;
+ long samplesperrow;
+ JDIMENSION jd_samplesperrow;
+
+#if JPEG_LIB_VERSION >= 70
+#if JPEG_LIB_VERSION >= 80
+ if (!transcode_only) /* v8+ builds only: the call below is skipped when transcoding */
+#endif
+ jpeg_calc_jpeg_dimensions(cinfo);
+#endif
+
+ /* Sanity check on image dimensions */
+ if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 ||
+ cinfo->num_components <= 0 || cinfo->input_components <= 0)
+ ERREXIT(cinfo, JERR_EMPTY_IMAGE);
+
+ /* Make sure image isn't bigger than I can handle */
+ if ((long)cinfo->_jpeg_height > (long)JPEG_MAX_DIMENSION ||
+ (long)cinfo->_jpeg_width > (long)JPEG_MAX_DIMENSION)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
+
+ /* Width of an input scanline must be representable as JDIMENSION. */
+ samplesperrow = (long)cinfo->image_width * (long)cinfo->input_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow;
+ if ((long)jd_samplesperrow != samplesperrow) /* the cast changed the value: it does not fit */
+ ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+
+ /* For now, precision must match compiled-in value... */
+ if (cinfo->data_precision != BITS_IN_JSAMPLE)
+ ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+ /* Check that number of components won't exceed internal array sizes */
+ if (cinfo->num_components > MAX_COMPONENTS)
+ ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
+ MAX_COMPONENTS);
+
+ /* Compute maximum sampling factors; check factor validity */
+ cinfo->max_h_samp_factor = 1;
+ cinfo->max_v_samp_factor = 1;
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ if (compptr->h_samp_factor <= 0 ||
+ compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+ compptr->v_samp_factor <= 0 ||
+ compptr->v_samp_factor > MAX_SAMP_FACTOR)
+ ERREXIT(cinfo, JERR_BAD_SAMPLING);
+ cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
+ compptr->h_samp_factor);
+ cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor,
+ compptr->v_samp_factor);
+ }
+
+ /* Compute dimensions of components (needs the maxima found by the loop above) */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Fill in the correct component_index value; don't rely on application */
+ compptr->component_index = ci;
+ /* For compression, we never do DCT scaling. */
+#if JPEG_LIB_VERSION >= 70
+ compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+#else
+ compptr->DCT_scaled_size = DCTSIZE;
+#endif
+ /* Size in DCT blocks */
+ compptr->width_in_blocks = (JDIMENSION)
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
+ compptr->height_in_blocks = (JDIMENSION)
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
+ /* Size in samples */
+ compptr->downsampled_width = (JDIMENSION)
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)cinfo->max_h_samp_factor);
+ compptr->downsampled_height = (JDIMENSION)
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)cinfo->max_v_samp_factor);
+ /* Mark component needed (this flag isn't actually used for compression) */
+ compptr->component_needed = TRUE;
+ }
+
+ /* Compute number of fully interleaved MCU rows (number of times that
+ * main controller will call coefficient controller).
+ */
+ cinfo->total_iMCU_rows = (JDIMENSION)
+ jdiv_round_up((long)cinfo->_jpeg_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+
+LOCAL(void)
+validate_script(j_compress_ptr cinfo)
+/* Verify that the scan script in cinfo->scan_info[] is valid; also
+ * determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
+ */
+{
+ const jpeg_scan_info *scanptr;
+ int scanno, ncomps, ci, coefi, thisi;
+ int Ss, Se, Ah, Al;
+ boolean component_sent[MAX_COMPONENTS]; /* initialized and used in sequential mode only */
+#ifdef C_PROGRESSIVE_SUPPORTED
+ int *last_bitpos_ptr;
+ int last_bitpos[MAX_COMPONENTS][DCTSIZE2];
+ /* -1 until that coefficient has been seen; then last Al for it */
+#endif
+
+ if (cinfo->num_scans <= 0)
+ ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, 0);
+
+ /* For sequential JPEG, all scans must have Ss=0, Se=DCTSIZE2-1;
+ * for progressive JPEG, no scan can have this.
+ * The mode is therefore decided by looking at the first scan only.
+ */
+ scanptr = cinfo->scan_info;
+ if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
+#ifdef C_PROGRESSIVE_SUPPORTED
+ cinfo->progressive_mode = TRUE;
+ last_bitpos_ptr = &last_bitpos[0][0];
+ for (ci = 0; ci < cinfo->num_components; ci++)
+ for (coefi = 0; coefi < DCTSIZE2; coefi++)
+ *last_bitpos_ptr++ = -1;
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ } else {
+ cinfo->progressive_mode = FALSE;
+ for (ci = 0; ci < cinfo->num_components; ci++)
+ component_sent[ci] = FALSE;
+ }
+
+ for (scanno = 1; scanno <= cinfo->num_scans; scanptr++, scanno++) { /* scanno is 1-based; used in error reports */
+ /* Validate component indexes */
+ ncomps = scanptr->comps_in_scan;
+ if (ncomps <= 0 || ncomps > MAX_COMPS_IN_SCAN)
+ ERREXIT2(cinfo, JERR_COMPONENT_COUNT, ncomps, MAX_COMPS_IN_SCAN);
+ for (ci = 0; ci < ncomps; ci++) {
+ thisi = scanptr->component_index[ci];
+ if (thisi < 0 || thisi >= cinfo->num_components)
+ ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+ /* Components must appear in SOF order within each scan */
+ if (ci > 0 && thisi <= scanptr->component_index[ci - 1])
+ ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+ }
+ /* Validate progression parameters */
+ Ss = scanptr->Ss;
+ Se = scanptr->Se;
+ Ah = scanptr->Ah;
+ Al = scanptr->Al;
+ if (cinfo->progressive_mode) {
+#ifdef C_PROGRESSIVE_SUPPORTED
+ /* Rec. ITU-T T.81 | ISO/IEC 10918-1 simply gives the ranges 0..13 for Ah
+ * and Al, but that seems wrong: the upper bound ought to depend on data
+ * precision. Perhaps they really meant 0..N+1 for N-bit precision.
+ * Here we allow 0..10 for 8-bit data; Al larger than 10 results in
+ * out-of-range reconstructed DC values during the first DC scan,
+ * which might cause problems for some decoders.
+ */
+#if BITS_IN_JSAMPLE == 8
+#define MAX_AH_AL 10
+#else
+#define MAX_AH_AL 13
+#endif
+ if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
+ Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ if (Ss == 0) {
+ if (Se != 0) /* DC and AC together not OK */
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ } else {
+ if (ncomps != 1) /* AC scans must be for only one component */
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ }
+ for (ci = 0; ci < ncomps; ci++) {
+ last_bitpos_ptr = &last_bitpos[scanptr->component_index[ci]][0];
+ if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ for (coefi = Ss; coefi <= Se; coefi++) {
+ if (last_bitpos_ptr[coefi] < 0) {
+ /* first scan of this coefficient */
+ if (Ah != 0)
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ } else {
+ /* not first scan */
+ if (Ah != last_bitpos_ptr[coefi] || Al != Ah - 1) /* Ah must equal the previous Al, and Al must be one less */
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ }
+ last_bitpos_ptr[coefi] = Al;
+ }
+ }
+#endif
+ } else {
+ /* For sequential JPEG, all progression parameters must be these: */
+ if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
+ ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+ /* Make sure components are not sent twice */
+ for (ci = 0; ci < ncomps; ci++) {
+ thisi = scanptr->component_index[ci];
+ if (component_sent[thisi])
+ ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+ component_sent[thisi] = TRUE;
+ }
+ }
+ }
+
+ /* Now verify that everything got sent. */
+ if (cinfo->progressive_mode) {
+#ifdef C_PROGRESSIVE_SUPPORTED
+ /* For progressive mode, we only check that at least some DC data
+ * got sent for each component; the spec does not require that all bits
+ * of all coefficients be transmitted. Would it be wiser to enforce
+ * transmission of all coefficient bits??
+ */
+ for (ci = 0; ci < cinfo->num_components; ci++) {
+ if (last_bitpos[ci][0] < 0)
+ ERREXIT(cinfo, JERR_MISSING_DATA);
+ }
+#endif
+ } else {
+ for (ci = 0; ci < cinfo->num_components; ci++) {
+ if (!component_sent[ci])
+ ERREXIT(cinfo, JERR_MISSING_DATA);
+ }
+ }
+}
+
+#endif /* C_MULTISCAN_FILES_SUPPORTED */
+
+
+LOCAL(void)
+select_scan_parameters(j_compress_ptr cinfo)
+/* Load cinfo's per-scan fields for the scan that is about to be processed */
+{
+  int n;
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+  if (cinfo->scan_info != NULL) {
+    /* Scripted scan: copy the entry selected by master->scan_number.  The
+     * script is already validated.
+     */
+    my_master_ptr master = (my_master_ptr)cinfo->master;
+    const jpeg_scan_info *scan = &cinfo->scan_info[master->scan_number];
+
+    cinfo->comps_in_scan = scan->comps_in_scan;
+    for (n = 0; n < scan->comps_in_scan; n++)
+      cinfo->cur_comp_info[n] = cinfo->comp_info + scan->component_index[n];
+
+    cinfo->Ss = scan->Ss;
+    cinfo->Se = scan->Se;
+    cinfo->Ah = scan->Ah;
+    cinfo->Al = scan->Al;
+  } else
+#endif
+  {
+    /* No script: a single sequential scan containing all components */
+    if (cinfo->num_components > MAX_COMPS_IN_SCAN)
+      ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
+               MAX_COMPS_IN_SCAN);
+    cinfo->comps_in_scan = cinfo->num_components;
+    for (n = 0; n < cinfo->num_components; n++)
+      cinfo->cur_comp_info[n] = cinfo->comp_info + n;
+    cinfo->Ss = 0;
+    cinfo->Se = DCTSIZE2 - 1;
+    cinfo->Ah = 0;
+    cinfo->Al = 0;
+  }
+}
+
+
+LOCAL(void)
+per_scan_setup(j_compress_ptr cinfo)
+/* Do computations that are needed before processing a JPEG scan */
+/* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */
+{
+  int ci, mcublks, tmp;
+  jpeg_component_info *compptr;
+
+  if (cinfo->comps_in_scan == 1) {
+    /* Noninterleaved (single-component) scan */
+    compptr = cinfo->cur_comp_info[0];
+
+    /* Overall image size in MCUs */
+    cinfo->MCUs_per_row = compptr->width_in_blocks;
+    cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
+
+    /* For noninterleaved scan, always one block per MCU */
+    compptr->MCU_width = 1;
+    compptr->MCU_height = 1;
+    compptr->MCU_blocks = 1;
+    compptr->MCU_sample_width = DCTSIZE;
+    compptr->last_col_width = 1;
+    /* For noninterleaved scans, it is convenient to define last_row_height
+     * as the number of block rows present in the last iMCU row.
+     */
+    tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+    if (tmp == 0) tmp = compptr->v_samp_factor;
+    compptr->last_row_height = tmp;
+
+    /* Prepare array describing MCU composition */
+    cinfo->blocks_in_MCU = 1;
+    cinfo->MCU_membership[0] = 0;
+  } else {
+    /* Interleaved (multi-component) scan */
+    if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
+      ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
+               MAX_COMPS_IN_SCAN);
+    /* Overall image size in MCUs */
+    cinfo->MCUs_per_row = (JDIMENSION)
+      jdiv_round_up((long)cinfo->_jpeg_width,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
+    cinfo->MCU_rows_in_scan = (JDIMENSION)
+      jdiv_round_up((long)cinfo->_jpeg_height,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
+    cinfo->blocks_in_MCU = 0;
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Sampling factors give # of blocks of component in each MCU */
+      compptr->MCU_width = compptr->h_samp_factor;
+      compptr->MCU_height = compptr->v_samp_factor;
+      compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
+      compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+      /* Figure number of non-dummy blocks in last MCU column & row */
+      tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
+      if (tmp == 0) tmp = compptr->MCU_width;
+      compptr->last_col_width = tmp;
+      tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
+      if (tmp == 0) tmp = compptr->MCU_height;
+      compptr->last_row_height = tmp;
+      /* Prepare array describing MCU composition */
+      mcublks = compptr->MCU_blocks;
+      if (cinfo->blocks_in_MCU + mcublks > C_MAX_BLOCKS_IN_MCU)
+        ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
+      while (mcublks-- > 0) {
+        cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
+      }
+    }
+  }
+
+  /* Convert restart specified in rows to actual MCU count.
+   * The count must fit in 16 bits, so it is limited to 65535.  The limit is
+   * tested by division before multiplying, so the product cannot overflow
+   * a long even where long is only 32 bits wide.
+   */
+  if (cinfo->restart_in_rows > 0) {
+    long rows = (long)cinfo->restart_in_rows;
+    long mcus = (long)cinfo->MCUs_per_row;
+    if (mcus > 0 && rows > 65535L / mcus)
+      cinfo->restart_interval = (unsigned int)65535L;
+    else
+      cinfo->restart_interval = (unsigned int)(rows * mcus);
+  }
+}
+
+
+/*
+ * Per-pass setup.
+ * This is called at the beginning of each pass. We determine which modules
+ * will be active during this pass and give them appropriate start_pass calls.
+ * We also set is_last_pass to indicate whether any more passes will be
+ * required.
+ *
+ * The work done depends on master->pass_type:
+ *   main_pass     - sets up the first scan and starts every module; the
+ *                   color-conversion, downsampling and prep modules are
+ *                   skipped when raw_data_in is set.
+ *   huff_opt_pass - starts only the entropy and coefficient modules, with
+ *                   the entropy coder's second argument TRUE; this case is
+ *                   compiled only when ENTROPY_OPT_SUPPORTED is defined.
+ *   output_pass   - starts the entropy and coefficient modules and writes
+ *                   the scan header (plus the frame header for scan 0).
+ * Any pass type without a matching case is reported as JERR_NOT_COMPILED.
+ * On exit, is_last_pass is recomputed and, if a progress monitor is
+ * installed, its pass counters are refreshed.
+ */
+
+METHODDEF(void)
+prepare_for_pass(j_compress_ptr cinfo)
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ switch (master->pass_type) {
+ case main_pass:
+ /* Initial pass: will collect input data, and do either Huffman
+ * optimization or data output for the first scan.
+ */
+ select_scan_parameters(cinfo);
+ per_scan_setup(cinfo);
+ if (!cinfo->raw_data_in) {
+ (*cinfo->cconvert->start_pass) (cinfo);
+ (*cinfo->downsample->start_pass) (cinfo);
+ (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
+ }
+ (*cinfo->fdct->start_pass) (cinfo);
+ (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding);
+ (*cinfo->coef->start_pass) (cinfo,
+ (master->total_passes > 1 ?
+ JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
+ (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
+ if (cinfo->optimize_coding) {
+ /* No immediate data output; postpone writing frame/scan headers */
+ master->pub.call_pass_startup = FALSE;
+ } else {
+ /* Will write frame/scan headers at first jpeg_write_scanlines call */
+ master->pub.call_pass_startup = TRUE;
+ }
+ break;
+#ifdef ENTROPY_OPT_SUPPORTED
+ case huff_opt_pass:
+ /* Do Huffman optimization for a scan after the first one. */
+ select_scan_parameters(cinfo);
+ per_scan_setup(cinfo);
+ if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+ (*cinfo->entropy->start_pass) (cinfo, TRUE);
+ (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
+ master->pub.call_pass_startup = FALSE;
+ break;
+ }
+ /* Special case: Huffman DC refinement scans need no Huffman table
+ * and therefore we can skip the optimization pass for them.
+ * The skipped pass is still counted: pass_number is advanced here before
+ * control falls into the output_pass case below.
+ */
+ master->pass_type = output_pass;
+ master->pass_number++;
+#endif
+ FALLTHROUGH /*FALLTHROUGH*/
+ case output_pass:
+ /* Do a data-output pass. */
+ /* We need not repeat per-scan setup if prior optimization pass did it. */
+ if (!cinfo->optimize_coding) {
+ select_scan_parameters(cinfo);
+ per_scan_setup(cinfo);
+ }
+ (*cinfo->entropy->start_pass) (cinfo, FALSE);
+ (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
+ /* We emit frame/scan headers now */
+ if (master->scan_number == 0)
+ (*cinfo->marker->write_frame_header) (cinfo);
+ (*cinfo->marker->write_scan_header) (cinfo);
+ master->pub.call_pass_startup = FALSE;
+ break;
+ default:
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ }
+
+ master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
+
+ /* Set up progress monitor's pass info if present */
+ if (cinfo->progress != NULL) {
+ cinfo->progress->completed_passes = master->pass_number;
+ cinfo->progress->total_passes = master->total_passes;
+ }
+}
+
+
+/*
+ * Special start-of-pass hook.
+ * This is called by jpeg_write_scanlines if call_pass_startup is TRUE.
+ * In single-pass processing, we need this hook because we don't want to
+ * write frame/scan headers during jpeg_start_compress; we want to let the
+ * application write COM markers etc. between jpeg_start_compress and the
+ * jpeg_write_scanlines loop.
+ * In multi-pass processing, this routine is not used.
+ */
+
+METHODDEF(void)
+pass_startup(j_compress_ptr cinfo)
+{
+  /* One-shot hook: clear the request flag first so that
+   * jpeg_write_scanlines does not invoke us a second time.
+   */
+  cinfo->master->call_pass_startup = FALSE;
+
+  /* Emit the deferred frame header, then the scan header. */
+  cinfo->marker->write_frame_header(cinfo);
+  cinfo->marker->write_scan_header(cinfo);
+}
+
+
+/*
+ * Finish up at end of pass.
+ */
+
+METHODDEF(void)
+finish_pass_master(j_compress_ptr cinfo)
+{
+  my_master_ptr state = (my_master_ptr)cinfo->master;
+
+  /* Every pass ends with an entropy-coder finish_pass call: it either
+   * analyzes the gathered statistics or flushes the output buffer.
+   */
+  cinfo->entropy->finish_pass(cinfo);
+
+  /* Advance the pass state for the next pass. */
+  if (state->pass_type == main_pass) {
+    /* With optimization, scan 0 still has to be output; without it,
+     * scan 0 is finished and the next pass outputs scan 1.
+     */
+    state->pass_type = output_pass;
+    if (!cinfo->optimize_coding)
+      state->scan_number++;
+  } else if (state->pass_type == huff_opt_pass) {
+    /* An optimization pass is always followed by output of the same scan. */
+    state->pass_type = output_pass;
+  } else if (state->pass_type == output_pass) {
+    /* Move on to the next scan, optimizing it first if requested. */
+    if (cinfo->optimize_coding)
+      state->pass_type = huff_opt_pass;
+    state->scan_number++;
+  }
+
+  state->pass_number++;
+}
+
+
+/*
+ * Initialize master compression control.
+ */
+
+GLOBAL(void)
+jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
+{
+  my_master_ptr master = (my_master_ptr)
+    cinfo->mem->alloc_small((j_common_ptr)cinfo, JPOOL_IMAGE,
+                            sizeof(my_comp_master));
+
+  /* Publish the master object and fill in its method pointers */
+  cinfo->master = (struct jpeg_comp_master *)master;
+  master->pub.prepare_for_pass = prepare_for_pass;
+  master->pub.pass_startup = pass_startup;
+  master->pub.finish_pass = finish_pass_master;
+  master->pub.is_last_pass = FALSE;
+
+  /* Validate parameters, determine derived values */
+  initial_setup(cinfo, transcode_only);
+
+  if (cinfo->scan_info == NULL) {
+    /* No scan script supplied: one non-progressive scan */
+    cinfo->progressive_mode = FALSE;
+    cinfo->num_scans = 1;
+  } else {
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+    validate_script(cinfo);
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  }
+
+  /* TEMPORARY HACK ???  Assume the default tables are no good for
+   * progressive mode, so force optimization unless arithmetic coding is
+   * selected.
+   */
+  if (cinfo->progressive_mode && !cinfo->arith_code)
+    cinfo->optimize_coding = TRUE;
+
+  /* Initialize private state.  Normal compression always begins with the
+   * main pass; transcoding has no main pass and begins with either an
+   * optimization pass or an output pass.
+   */
+  if (!transcode_only)
+    master->pass_type = main_pass;
+  else
+    master->pass_type = cinfo->optimize_coding ? huff_opt_pass : output_pass;
+  master->scan_number = 0;
+  master->pass_number = 0;
+  /* Two passes per scan when entropy optimization is on, otherwise one */
+  master->total_passes =
+    cinfo->optimize_coding ? cinfo->num_scans * 2 : cinfo->num_scans;
+
+  master->jpeg_version = PACKAGE_NAME " version " VERSION " (build " BUILD ")";
+}
diff --git a/media/libjpeg/jcomapi.c b/media/libjpeg/jcomapi.c
new file mode 100644
index 0000000000..efbb8357b0
--- /dev/null
+++ b/media/libjpeg/jcomapi.c
@@ -0,0 +1,109 @@
+/*
+ * jcomapi.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains application interface routines that are used for both
+ * compression and decompression.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/*
+ * Abort processing of a JPEG compression or decompression operation,
+ * but don't destroy the object itself.
+ *
+ * For this, we merely clean up all the nonpermanent memory pools.
+ * Note that temp files (virtual arrays) are not allowed to belong to
+ * the permanent pool, so we will be able to close all temp files here.
+ * Closing a data source or destination, if necessary, is the application's
+ * responsibility.
+ */
+
+GLOBAL(void)
+jpeg_abort(j_common_ptr cinfo)
+{
+  int pool_id;
+
+  /* Nothing to do for an object that was never initialized or has already
+   * been destroyed.
+   */
+  if (cinfo->mem == NULL)
+    return;
+
+  /* Free every pool above the permanent one, highest first; the reverse
+   * order might help avoid fragmentation with some (brain-damaged) malloc
+   * libraries.
+   */
+  pool_id = JPOOL_NUMPOOLS;
+  while (--pool_id > JPOOL_PERMANENT)
+    cinfo->mem->free_pool(cinfo, pool_id);
+
+  /* Put the object back into its start state so that it can be reused */
+  if (!cinfo->is_decompressor) {
+    cinfo->global_state = CSTATE_START;
+    return;
+  }
+  cinfo->global_state = DSTATE_START;
+  /* Clear the pointer to the now-deleted marker list so the application
+   * cannot follow it.  A bit kludgy to do it here, but this is the most
+   * central place.
+   */
+  ((j_decompress_ptr)cinfo)->marker_list = NULL;
+}
+
+
+/*
+ * Destruction of a JPEG object.
+ *
+ * Everything gets deallocated except the master jpeg_compress_struct itself
+ * and the error manager struct. Both of these are supplied by the application
+ * and must be freed, if necessary, by the application. (Often they are on
+ * the stack and so don't need to be freed anyway.)
+ * Closing a data source or destination, if necessary, is the application's
+ * responsibility.
+ */
+
+GLOBAL(void)
+jpeg_destroy(j_common_ptr cinfo)
+{
+  /* Releasing everything is delegated to the memory manager.  The mem
+   * pointer is NULL if the memory manager failed to initialize, in which
+   * case there is nothing to release.
+   */
+  if (cinfo->mem != NULL) {
+    cinfo->mem->self_destruct(cinfo);
+    cinfo->mem = NULL;          /* be safe if jpeg_destroy is called twice */
+  }
+  cinfo->global_state = 0;      /* mark it destroyed */
+}
+
+
+/*
+ * Convenience routines for allocating quantization and Huffman tables.
+ * (Would jutils.c be a more reasonable place to put these?)
+ */
+
+GLOBAL(JQUANT_TBL *)
+jpeg_alloc_quant_table(j_common_ptr cinfo)
+{
+  /* Quantization tables are allocated from the permanent pool */
+  JQUANT_TBL *qtbl = (JQUANT_TBL *)
+    cinfo->mem->alloc_small(cinfo, JPOOL_PERMANENT, sizeof(JQUANT_TBL));
+
+  /* A freshly allocated table must never be marked as already sent */
+  qtbl->sent_table = FALSE;
+  return qtbl;
+}
+
+
+GLOBAL(JHUFF_TBL *)
+jpeg_alloc_huff_table(j_common_ptr cinfo)
+{
+  /* Huffman tables are allocated from the permanent pool */
+  JHUFF_TBL *htbl = (JHUFF_TBL *)
+    cinfo->mem->alloc_small(cinfo, JPOOL_PERMANENT, sizeof(JHUFF_TBL));
+
+  /* A freshly allocated table must never be marked as already sent */
+  htbl->sent_table = FALSE;
+  return htbl;
+}
diff --git a/media/libjpeg/jconfig.h b/media/libjpeg/jconfig.h
new file mode 100644
index 0000000000..b5ea80217e
--- /dev/null
+++ b/media/libjpeg/jconfig.h
@@ -0,0 +1,37 @@
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION 62
+
+/* libjpeg-turbo version */
+#define LIBJPEG_TURBO_VERSION 2.1.5.1
+
+/* libjpeg-turbo version in integer form */
+#define LIBJPEG_TURBO_VERSION_NUMBER 2001005
+
+/* Support arithmetic encoding */
+/* #undef C_ARITH_CODING_SUPPORTED */
+
+/* Support arithmetic decoding */
+/* #undef D_ARITH_CODING_SUPPORTED */
+
+/* Support in-memory source/destination managers */
+#define MEM_SRCDST_SUPPORTED 1
+
+/* Use accelerated SIMD routines. */
+#define WITH_SIMD 1
+
+/*
+ * Define BITS_IN_JSAMPLE as either
+ * 8 for 8-bit sample values (the usual setting)
+ * 12 for 12-bit sample values
+ * Only 8 and 12 are legal data precisions for lossy JPEG according to the
+ * JPEG standard, and the IJG code does not support anything else!
+ * We do not support run-time selection of data precision, sorry.
+ */
+
+#define BITS_IN_JSAMPLE 8 /* use 8 or 12 */
+
+/* Define if your (broken) compiler shifts signed values as if they were
+ unsigned. */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */
diff --git a/media/libjpeg/jconfigint.h b/media/libjpeg/jconfigint.h
new file mode 100644
index 0000000000..5a1f305f30
--- /dev/null
+++ b/media/libjpeg/jconfigint.h
@@ -0,0 +1,54 @@
+/* libjpeg-turbo build number */
+#define BUILD "20230208"
+
+/* Need to use Mozilla-specific function inlining. */
+#include "mozilla/Attributes.h"
+#define INLINE MOZ_ALWAYS_INLINE
+
+/* How to obtain thread-local storage */
+#if defined(_MSC_VER)
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL __thread
+#endif
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Version number of package */
+#define VERSION "2.1.5.1"
+
+/* The size of `size_t', as computed by sizeof. */
+#ifdef HAVE_64BIT_BUILD
+#define SIZEOF_SIZE_T 8
+#else
+#define SIZEOF_SIZE_T 4
+#endif
+
+/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
+#ifndef _MSC_VER
+#define HAVE_BUILTIN_CTZL 1
+#endif
+
+/* Define to 1 if you have the <intrin.h> header file. */
+#ifdef _MSC_VER
+#define HAVE_INTRIN_H 1
+#endif
+
+#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+#if (SIZEOF_SIZE_T == 8)
+#define HAVE_BITSCANFORWARD64
+#elif (SIZEOF_SIZE_T == 4)
+#define HAVE_BITSCANFORWARD
+#endif
+#endif
+
+#if defined(__has_attribute) /* compiler can be queried about attributes */
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH __attribute__((fallthrough)); /* expansion carries its own ';' */
+#else
+#define FALLTHROUGH /* attribute unavailable: expands to nothing */
+#endif /* __has_attribute(fallthrough) */
+#else
+#define FALLTHROUGH /* no __has_attribute: expands to nothing */
+#endif /* defined(__has_attribute) */
diff --git a/media/libjpeg/jcparam.c b/media/libjpeg/jcparam.c
new file mode 100644
index 0000000000..5bc7174dcb
--- /dev/null
+++ b/media/libjpeg/jcparam.c
@@ -0,0 +1,541 @@
+/*
+ * jcparam.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2018, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains optional default-setting code for the JPEG compressor.
+ * Applications do not have to use this file, but those that don't use it
+ * must know a lot more about the innards of the JPEG code.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jstdhuff.c"
+
+
+/*
+ * Quantization table setup routines
+ */
+
+GLOBAL(void)
+jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                     const unsigned int *basic_table, int scale_factor,
+                     boolean force_baseline)
+/* Install quantization table number which_tbl, computed as basic_table
+ * multiplied by scale_factor, where scale_factor is a percentage.
+ * When force_baseline is TRUE, each computed entry is additionally kept
+ * within 1..255 for JPEG baseline compatibility.
+ */
+{
+  JQUANT_TBL *tbl;
+  int k;
+
+  /* Tables may only be changed before start_compress has been called. */
+  if (cinfo->global_state != CSTATE_START)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS)
+    ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl);
+
+  /* Allocate the table slot on first use */
+  if (cinfo->quant_tbl_ptrs[which_tbl] == NULL)
+    cinfo->quant_tbl_ptrs[which_tbl] =
+      jpeg_alloc_quant_table((j_common_ptr)cinfo);
+  tbl = cinfo->quant_tbl_ptrs[which_tbl];
+
+  for (k = 0; k < DCTSIZE2; k++) {
+    /* Apply the percentage; 50 is added before dividing by 100 to round */
+    long scaled = ((long)basic_table[k] * scale_factor + 50L) / 100L;
+
+    /* Limit the value to the valid range */
+    if (scaled <= 0L)
+      scaled = 1L;
+    else if (scaled > 32767L)
+      scaled = 32767L;          /* max quantizer needed for 12 bits */
+    if (force_baseline && scaled > 255L)
+      scaled = 255L;            /* limit to baseline range if requested */
+    tbl->quantval[k] = (UINT16)scaled;
+  }
+
+  /* Mark the table unsent so that it will be written to the JPEG file. */
+  tbl->sent_table = FALSE;
+}
+
+
+/* These are the sample quantization tables given in Annex K (Clause K.1) of
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ * In this file, jpeg_set_linear_quality() passes the luminance table to
+ * jpeg_add_quant_table() as table 0 and the chrominance table as table 1;
+ * each holds DCTSIZE2 entries.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+ 16, 11, 10, 16, 24, 40, 51, 61,
+ 12, 12, 14, 19, 26, 58, 60, 55,
+ 14, 13, 16, 24, 40, 57, 69, 56,
+ 14, 17, 22, 29, 51, 87, 80, 62,
+ 18, 22, 37, 56, 68, 109, 103, 77,
+ 24, 35, 55, 64, 81, 104, 113, 92,
+ 49, 64, 78, 87, 103, 121, 120, 101,
+ 72, 92, 95, 98, 112, 100, 103, 99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+ 17, 18, 24, 47, 99, 99, 99, 99,
+ 18, 21, 26, 66, 99, 99, 99, 99,
+ 24, 26, 56, 99, 99, 99, 99, 99,
+ 47, 66, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99
+};
+
+
+#if JPEG_LIB_VERSION >= 70
+GLOBAL(void)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
+/* Build the two default quantization tables with straight percentage
+ * scaling.  Unlike jpeg_set_linear_quality, luminance and chrominance each
+ * take their own scale factor, read from cinfo->q_scale_factor[].
+ */
+{
+  int luma_scale = cinfo->q_scale_factor[0];
+  int chroma_scale = cinfo->q_scale_factor[1];
+
+  /* Table 0 is luminance, table 1 is chrominance */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, luma_scale,
+                       force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, chroma_scale,
+                       force_baseline);
+}
+#endif
+
+
+GLOBAL(void)
+jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                        boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting from the default
+ * tables, applying one straight percentage scale factor to both of them.
+ * jpeg_set_quality (below) is usually the better choice; this entry point
+ * exists for applications that insist on a linear percentage scaling.
+ */
+{
+  const unsigned int *std_tbl[2];
+  int tblno;
+
+  std_tbl[0] = std_luminance_quant_tbl;
+  std_tbl[1] = std_chrominance_quant_tbl;
+
+  /* Install both tables, in index order, at the same scaling */
+  for (tblno = 0; tblno < 2; tblno++)
+    jpeg_add_quant_table(cinfo, tblno, std_tbl[tblno], scale_factor,
+                         force_baseline);
+}
+
+
+GLOBAL(int)
+jpeg_quality_scaling(int quality)
+/* Map a user-specified quality rating onto the percentage scaling factor
+ * applied to an underlying quantization table, following our recommended
+ * scaling curve.  'quality' should run from 0 (terrible) to 100 (very good).
+ */
+{
+  int q = quality;
+
+  /* Clamp to 1..100; 0 is raised to 1 so the division below is safe. */
+  if (q <= 0)
+    q = 1;
+  else if (q > 100)
+    q = 100;
+
+  /* Quality 50 uses the basic table as-is (scaling 100).
+   * Qualities below 50 map to 5000/Q, qualities 50..100 map to 200 - 2*Q.
+   * At Q=100 the scaling is 0, which makes jpeg_add_quant_table set every
+   * table entry to 1 (hence, minimum quantization loss).
+   */
+  return (q < 50) ? 5000 / q : 200 - q * 2;
+}
+
+
+GLOBAL(void)
+jpeg_set_quality(j_compress_ptr cinfo, int quality, boolean force_baseline)
+/* Standard quality-adjusting entry point for typical user interfaces: set
+ * or change the 'quality' (quantization) setting using the default tables.
+ * Only callers wanting detailed control over quantization tables need the
+ * preceding routines.
+ */
+{
+  /* Translate the user's 0-100 rating into a percentage scaling, then
+   * install the standard tables at that scaling.
+   */
+  jpeg_set_linear_quality(cinfo, jpeg_quality_scaling(quality),
+                          force_baseline);
+}
+
+
+/*
+ * Default parameter setup for compression.
+ *
+ * Applications that don't choose to use this routine must do their
+ * own setup of all these parameters. Alternately, you can call this
+ * to establish defaults and then alter parameters selectively. This
+ * is the recommended approach since, if we add any new parameters,
+ * your code will still work (they'll be set to reasonable defaults).
+ *
+ * Must be called while global_state is CSTATE_START; any other state
+ * raises JERR_BAD_STATE. The final step, jpeg_default_colorspace, reads
+ * cinfo->in_color_space to choose the JPEG colorspace.
+ */
+
+GLOBAL(void)
+jpeg_set_defaults(j_compress_ptr cinfo)
+{
+ int i;
+
+ /* Safety check to ensure start_compress not called yet. */
+ if (cinfo->global_state != CSTATE_START)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Allocate comp_info array large enough for maximum component count.
+ * Array is made permanent in case application wants to compress
+ * multiple images at same param settings.
+ * An already-allocated array is kept and reused.
+ */
+ if (cinfo->comp_info == NULL)
+ cinfo->comp_info = (jpeg_component_info *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+ MAX_COMPONENTS * sizeof(jpeg_component_info));
+
+ /* Initialize everything not dependent on the color space */
+
+#if JPEG_LIB_VERSION >= 70
+ cinfo->scale_num = 1; /* 1:1 scaling */
+ cinfo->scale_denom = 1;
+#endif
+ cinfo->data_precision = BITS_IN_JSAMPLE;
+ /* Set up two quantization tables using default quality of 75 */
+ jpeg_set_quality(cinfo, 75, TRUE);
+ /* Set up two Huffman tables */
+ std_huff_tables((j_common_ptr)cinfo);
+
+ /* Initialize default arithmetic coding conditioning */
+ for (i = 0; i < NUM_ARITH_TBLS; i++) {
+ cinfo->arith_dc_L[i] = 0;
+ cinfo->arith_dc_U[i] = 1;
+ cinfo->arith_ac_K[i] = 5;
+ }
+
+ /* Default is no multiple-scan output */
+ cinfo->scan_info = NULL;
+ cinfo->num_scans = 0;
+
+ /* Expect normal source image, not raw downsampled data */
+ cinfo->raw_data_in = FALSE;
+
+ /* Use Huffman coding, not arithmetic coding, by default */
+ cinfo->arith_code = FALSE;
+
+ /* By default, don't do extra passes to optimize entropy coding */
+ cinfo->optimize_coding = FALSE;
+ /* The standard Huffman tables are only valid for 8-bit data precision.
+ * If the precision is higher, force optimization on so that usable
+ * tables will be computed. This test can be removed if default tables
+ * are supplied that are valid for the desired precision.
+ */
+ if (cinfo->data_precision > 8)
+ cinfo->optimize_coding = TRUE;
+
+ /* By default, use the simpler non-cosited sampling alignment */
+ cinfo->CCIR601_sampling = FALSE;
+
+#if JPEG_LIB_VERSION >= 70
+ /* By default, apply fancy downsampling */
+ cinfo->do_fancy_downsampling = TRUE;
+#endif
+
+ /* No input smoothing */
+ cinfo->smoothing_factor = 0;
+
+ /* DCT algorithm preference */
+ cinfo->dct_method = JDCT_DEFAULT;
+
+ /* No restart markers */
+ cinfo->restart_interval = 0;
+ cinfo->restart_in_rows = 0;
+
+ /* Fill in default JFIF marker parameters. Note that whether the marker
+ * will actually be written is determined by jpeg_set_colorspace.
+ *
+ * By default, the library emits JFIF version code 1.01.
+ * An application that wants to emit JFIF 1.02 extension markers should set
+ * JFIF_minor_version to 2. We could probably get away with just defaulting
+ * to 1.02, but there may still be some decoders in use that will complain
+ * about that; saying 1.01 should minimize compatibility problems.
+ */
+ cinfo->JFIF_major_version = 1; /* Default JFIF version = 1.01 */
+ cinfo->JFIF_minor_version = 1;
+ cinfo->density_unit = 0; /* Pixel size is unknown by default */
+ cinfo->X_density = 1; /* Pixel aspect ratio is square by default */
+ cinfo->Y_density = 1;
+
+ /* Choose JPEG colorspace based on input space, set defaults accordingly */
+
+ jpeg_default_colorspace(cinfo);
+}
+
+
+/*
+ * Select an appropriate JPEG colorspace for in_color_space.
+ */
+
+GLOBAL(void)
+jpeg_default_colorspace(j_compress_ptr cinfo)
+{
+  switch (cinfo->in_color_space) {
+  case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+  case JCS_EXT_RGBA:
+  case JCS_EXT_BGRA:
+  case JCS_EXT_ABGR:
+  case JCS_EXT_ARGB:
+  case JCS_YCbCr:
+    /* Every RGB pixel layout, and YCbCr itself, defaults to YCbCr */
+    jpeg_set_colorspace(cinfo, JCS_YCbCr);
+    break;
+  case JCS_GRAYSCALE:
+  case JCS_CMYK:
+  case JCS_YCCK:
+  case JCS_UNKNOWN:
+    /* By default, no translation: the JPEG colorspace is the input one */
+    jpeg_set_colorspace(cinfo, cinfo->in_color_space);
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+  }
+}
+
+
+/*
+ * Set the JPEG colorspace, and choose colorspace-dependent default values.
+ *
+ * For the given colorspace this sets num_components and, through the
+ * SET_COMP macro, each component's ID, sampling factors and
+ * quantization/DC/AC table numbers. It also sets the write_JFIF_header and
+ * write_Adobe_marker flags. JCS_UNKNOWN takes its component count from
+ * cinfo->input_components and rejects counts outside 1..MAX_COMPONENTS
+ * with JERR_COMPONENT_COUNT. A colorspace without a matching case raises
+ * JERR_BAD_J_COLORSPACE. Like the other parameter routines, it requires
+ * global_state to be CSTATE_START.
+ * Note that SET_COMP is defined inside the function body and is not
+ * #undef'd at the end of the function.
+ */
+
+GLOBAL(void)
+jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+{
+ jpeg_component_info *compptr;
+ int ci;
+
+#define SET_COMP(index, id, hsamp, vsamp, quant, dctbl, actbl) \
+ (compptr = &cinfo->comp_info[index], \
+ compptr->component_id = (id), \
+ compptr->h_samp_factor = (hsamp), \
+ compptr->v_samp_factor = (vsamp), \
+ compptr->quant_tbl_no = (quant), \
+ compptr->dc_tbl_no = (dctbl), \
+ compptr->ac_tbl_no = (actbl) )
+
+ /* Safety check to ensure start_compress not called yet. */
+ if (cinfo->global_state != CSTATE_START)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* For all colorspaces, we use Q and Huff tables 0 for luminance components,
+ * tables 1 for chrominance components.
+ */
+
+ cinfo->jpeg_color_space = colorspace;
+
+ cinfo->write_JFIF_header = FALSE; /* No marker for non-JFIF colorspaces */
+ cinfo->write_Adobe_marker = FALSE; /* write no Adobe marker by default */
+
+ switch (colorspace) {
+ case JCS_GRAYSCALE:
+ cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
+ cinfo->num_components = 1;
+ /* JFIF specifies component ID 1 */
+ SET_COMP(0, 1, 1, 1, 0, 0, 0);
+ break;
+ case JCS_RGB:
+ cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
+ cinfo->num_components = 3;
+ SET_COMP(0, 0x52 /* 'R' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x47 /* 'G' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x42 /* 'B' */, 1, 1, 0, 0, 0);
+ break;
+ case JCS_YCbCr:
+ cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
+ cinfo->num_components = 3;
+ /* JFIF specifies component IDs 1,2,3 */
+ /* We default to 2x2 subsamples of chrominance */
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
+ break;
+ case JCS_CMYK:
+ cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */
+ cinfo->num_components = 4;
+ SET_COMP(0, 0x43 /* 'C' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x4D /* 'M' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x59 /* 'Y' */, 1, 1, 0, 0, 0);
+ SET_COMP(3, 0x4B /* 'K' */, 1, 1, 0, 0, 0);
+ break;
+ case JCS_YCCK:
+ cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */
+ cinfo->num_components = 4;
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
+ SET_COMP(3, 4, 2, 2, 0, 0, 0);
+ break;
+ case JCS_UNKNOWN:
+ cinfo->num_components = cinfo->input_components;
+ if (cinfo->num_components < 1 || cinfo->num_components > MAX_COMPONENTS)
+ ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
+ MAX_COMPONENTS);
+ for (ci = 0; ci < cinfo->num_components; ci++) {
+ SET_COMP(ci, ci, 1, 1, 0, 0, 0);
+ }
+ break;
+ default:
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ }
+}
+
+
+#ifdef C_PROGRESSIVE_SUPPORTED
+
+LOCAL(jpeg_scan_info *)
+fill_a_scan(jpeg_scan_info *scanptr, int ci, int Ss, int Se, int Ah, int Al)
+/* Support routine: write one single-component scan entry for component ci
+ * at scanptr and return a pointer to the entry that follows it.
+ */
+{
+  jpeg_scan_info *entry = scanptr;
+
+  entry->comps_in_scan = 1;
+  entry->component_index[0] = ci;
+  entry->Ss = Ss;
+  entry->Se = Se;
+  entry->Ah = Ah;
+  entry->Al = Al;
+  return entry + 1;
+}
+
+LOCAL(jpeg_scan_info *)
+fill_scans(jpeg_scan_info *scanptr, int ncomps, int Ss, int Se, int Ah, int Al)
+/* Support routine: write one single-component scan entry per component,
+ * all sharing the same Ss/Se/Ah/Al, and return the next free entry.
+ */
+{
+  int comp;
+
+  /* fill_a_scan writes exactly one such entry and advances the pointer */
+  for (comp = 0; comp < ncomps; comp++)
+    scanptr = fill_a_scan(scanptr, comp, Ss, Se, Ah, Al);
+  return scanptr;
+}
+
+LOCAL(jpeg_scan_info *)
+fill_dc_scans(jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
+/* Support routine: write the DC scan(s) - one interleaved scan when all
+ * components fit into a single scan, otherwise one scan per component.
+ * Returns the next free entry.
+ */
+{
+  int comp;
+
+  /* Too many components to interleave: noninterleaved DC scan for each */
+  if (ncomps > MAX_COMPS_IN_SCAN)
+    return fill_scans(scanptr, ncomps, 0, 0, Ah, Al);
+
+  /* Single interleaved DC scan covering every component */
+  scanptr->comps_in_scan = ncomps;
+  for (comp = 0; comp < ncomps; comp++)
+    scanptr->component_index[comp] = comp;
+  scanptr->Se = 0;
+  scanptr->Ss = 0;
+  scanptr->Ah = Ah;
+  scanptr->Al = Al;
+  return scanptr + 1;
+}
+
+
+/*
+ * Create a recommended progressive-JPEG script.
+ * cinfo->num_components and cinfo->jpeg_color_space must be correct.
+ */
+
+GLOBAL(void)
+jpeg_simple_progression(j_compress_ptr cinfo)
+{
+  int ncomps = cinfo->num_components;
+  int use_ycbcr_script;
+  int nscans;
+  jpeg_scan_info *next;
+
+  /* Safety check to ensure start_compress not called yet. */
+  if (cinfo->global_state != CSTATE_START)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Three-component YCbCr images get a custom script; everything else
+   * gets the all-purpose one.
+   */
+  use_ycbcr_script = (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr);
+
+  /* Figure space needed for script.  Must match the fill calls below! */
+  if (use_ycbcr_script)
+    nscans = 10;
+  else if (ncomps > MAX_COMPS_IN_SCAN)
+    nscans = 6 * ncomps;        /* 2 DC + 4 AC scans per component */
+  else
+    nscans = 2 + 4 * ncomps;    /* 2 DC scans; 4 AC scans per component */
+
+  /* Allocate space for script.
+   * It goes into the permanent pool in case the application performs
+   * multiple compressions without changing the settings.  To avoid a memory
+   * leak when this routine is called repeatedly for the same JPEG object,
+   * previously allocated space is re-used when it is big enough, and at
+   * least 10 entries are allocated so that a later YCbCr request fits even
+   * if grayscale was asked for first.
+   */
+  if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) {
+    cinfo->script_space_size = MAX(nscans, 10);
+    cinfo->script_space = (jpeg_scan_info *)
+      cinfo->mem->alloc_small((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                              cinfo->script_space_size *
+                              sizeof(jpeg_scan_info));
+  }
+  next = cinfo->script_space;
+  cinfo->scan_info = next;
+  cinfo->num_scans = nscans;
+
+  if (!use_ycbcr_script) {
+    /* All-purpose script for other color spaces. */
+    /* Successive approximation first pass */
+    next = fill_dc_scans(next, ncomps, 0, 1);
+    next = fill_scans(next, ncomps, 1, 5, 0, 2);
+    next = fill_scans(next, ncomps, 6, 63, 0, 2);
+    /* Successive approximation second pass */
+    next = fill_scans(next, ncomps, 1, 63, 2, 1);
+    /* Successive approximation final pass */
+    next = fill_dc_scans(next, ncomps, 1, 0);
+    next = fill_scans(next, ncomps, 1, 63, 1, 0);
+    return;
+  }
+
+  /* Custom script for YCbCr color images. */
+  /* Initial DC scan */
+  next = fill_dc_scans(next, ncomps, 0, 1);
+  /* Initial AC scan: get some luma data out in a hurry */
+  next = fill_a_scan(next, 0, 1, 5, 0, 2);
+  /* Chroma data is too small to be worth expending many scans on */
+  next = fill_a_scan(next, 2, 1, 63, 0, 1);
+  next = fill_a_scan(next, 1, 1, 63, 0, 1);
+  /* Complete spectral selection for luma AC */
+  next = fill_a_scan(next, 0, 6, 63, 0, 2);
+  /* Refine next bit of luma AC */
+  next = fill_a_scan(next, 0, 1, 63, 2, 1);
+  /* Finish DC successive approximation */
+  next = fill_dc_scans(next, ncomps, 1, 0);
+  /* Finish AC successive approximation */
+  next = fill_a_scan(next, 2, 1, 63, 1, 0);
+  next = fill_a_scan(next, 1, 1, 63, 1, 0);
+  /* Luma bottom bit comes last since it's usually largest scan */
+  next = fill_a_scan(next, 0, 1, 63, 1, 0);
+}
+
+#endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c
new file mode 100644
index 0000000000..5006b67075
--- /dev/null
+++ b/media/libjpeg/jcphuff.c
@@ -0,0 +1,1113 @@
+/*
+ * jcphuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1995-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
+ * Copyright (C) 2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2021, Alex Richardson.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy encoding routines for progressive JPEG.
+ *
+ * We do not support output suspension in this module, since the library
+ * currently does not allow multiple-scan files to be written with output
+ * suspension.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include <limits.h>
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
+
+#ifdef C_PROGRESSIVE_SUPPORTED
+
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__, so the __GNUC__ test below
+ * covers either compiler when the target is Arm.  USE_CLZ_INTRINSIC is left
+ * undefined when __thumb__ is defined without __thumb2__.
+ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x)) /* bit length of x; x must be nonzero */
+#else
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) /* bit length of x; x must be nonzero */
+#endif
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) /* zero is special-cased and never reaches the clz intrinsic */
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x]) /* x is used directly as the table index */
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) /* table path needs no separate nonzero variant */
+#endif
+
+
+/* Expanded entropy encoder object for progressive Huffman encoding.
+ * jinit_phuff_encoder() allocates one of these and stores it in
+ * cinfo->entropy; the routines in this file cast cinfo->entropy back to
+ * phuff_entropy_ptr to reach the private fields.
+ */
+
+typedef struct {
+ struct jpeg_entropy_encoder pub; /* public fields */
+
+ /* Pointer to routine to prepare data for encode_mcu_AC_first()
+ * (set by start_pass_phuff() to the SIMD or the C implementation)
+ */
+ void (*AC_first_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits);
+ /* Pointer to routine to prepare data for encode_mcu_AC_refine().
+ * Its int result is added to the absvalues pointer by the caller to form
+ * the end-of-block pointer.
+ */
+ int (*AC_refine_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits);
+
+ /* Mode flag: TRUE for optimization, FALSE for actual data output */
+ boolean gather_statistics;
+
+ /* Bit-level coding status.
+ * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
+ * Each encode_mcu_* routine loads them on entry and writes them back
+ * before returning.
+ */
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ size_t put_buffer; /* current bit-accumulation buffer */
+ int put_bits; /* # of bits now in it */
+ j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */
+
+ /* Coding status for DC components */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+
+ /* Coding status for AC components */
+ int ac_tbl_no; /* the table number of the single component */
+ unsigned int EOBRUN; /* run length of EOBs */
+ unsigned int BE; /* # of buffered correction bits before MCU */
+ char *bit_buffer; /* buffer for correction bits (1 per char);
+ * NULL until start_pass_phuff() allocates
+ * MAX_CORR_BITS chars for an AC refinement scan */
+ /* packing correction bits tightly would save some space but cost time... */
+
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+ int next_restart_num; /* next restart number to write (0-7) */
+
+ /* Pointers to derived tables (these workspaces have image lifespan).
+ * Since any one scan codes only DC or only AC, we only need one set
+ * of tables, not one for DC and one for AC.
+ */
+ c_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
+
+ /* Statistics tables for optimization; again, one set is enough.
+ * Each non-NULL entry points to 257 counters allocated by
+ * start_pass_phuff() during a statistics-gathering pass.
+ */
+ long *count_ptrs[NUM_HUFF_TBLS];
+} phuff_entropy_encoder;
+
+typedef phuff_entropy_encoder *phuff_entropy_ptr;
+
+/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
+ * buffer can hold. Larger sizes may slightly improve compression, but
+ * 1000 is already well into the realm of overkill.
+ * The minimum safe size is 64 bits.
+ * encode_mcu_AC_refine() forces the pending EOB run out once more than
+ * MAX_CORR_BITS - DCTSIZE2 + 1 bits are buffered.
+ */
+
+#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
+ * which should be safe.
+ *
+ * In the RIGHT_SHIFT_IS_UNSIGNED variant a negative operand has one-bits
+ * OR'ed in from bit (16 - shft) upward to emulate an arithmetic shift, and
+ * the operand is first stored in ishift_temp; a function using IRIGHT_SHIFT
+ * must therefore place ISHIFT_TEMPS among its local declarations.
+ * NOTE(review): the constant 16 presumably relies on the operand fitting in
+ * 16 signed bits (a JCOEF) -- confirm before using it on wider values.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS int ishift_temp;
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
+#endif
+
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1))) /* round v up to a multiple of p; the mask form requires p to be a power of 2, and v is not parenthesized, so pass a simple or cast expression */
+
+/* Forward declarations.
+ * start_pass_phuff() stores the addresses of these routines in the entropy
+ * object before their definitions appear further down in this file.
+ */
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *values, size_t *zerobits);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(int) encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *absvalues, size_t *bits);
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+
+
+/* Strip the trailing zero bits from *x and return how many were removed.
+ * On return the lowest set bit of the original value sits in bit 0 of *x.
+ * The argument must be nonzero: every caller in this file tests its mask
+ * first, and the portable fallback below would never terminate on zero.
+ */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+  int shift = __builtin_ctzl(*x);
+
+  *x >>= shift;
+#elif defined(HAVE_BITSCANFORWARD64)
+  unsigned long shift;
+
+  _BitScanForward64(&shift, *x);
+  *x >>= shift;
+#elif defined(HAVE_BITSCANFORWARD)
+  unsigned long shift;
+
+  _BitScanForward(&shift, *x);
+  *x >>= shift;
+#else
+  int shift;
+
+  /* No intrinsic available: peel off one zero bit per iteration */
+  for (shift = 0; !(*x & 1); shift++)
+    *x >>= 1;
+#endif
+  return (int)shift;
+}
+
+
+/*
+ * Initialize for a Huffman-compressed scan using progressive JPEG.
+ *
+ * Chooses the encode_mcu and finish_pass methods from the scan parameters
+ * (Ss == 0 means a DC scan, Ah == 0 means a first pass), prepares the
+ * per-table state for every component in the scan (zeroed statistics
+ * counters when gather_statistics is TRUE, derived Huffman tables
+ * otherwise), and resets the EOB-run, bit-buffer and restart bookkeeping.
+ */
+
+METHODDEF(void)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ boolean is_DC_band;
+ int ci, tbl;
+ jpeg_component_info *compptr;
+
+ entropy->cinfo = cinfo;
+ entropy->gather_statistics = gather_statistics;
+
+ is_DC_band = (cinfo->Ss == 0);
+
+ /* We assume jcmaster.c already validated the scan parameters. */
+
+ /* Select execution routines */
+ if (cinfo->Ah == 0) {
+ if (is_DC_band)
+ entropy->pub.encode_mcu = encode_mcu_DC_first;
+ else
+ entropy->pub.encode_mcu = encode_mcu_AC_first;
+ if (jsimd_can_encode_mcu_AC_first_prepare()) /* not part of the else above: runs for DC first scans too */
+ entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+ else
+ entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
+ } else {
+ if (is_DC_band)
+ entropy->pub.encode_mcu = encode_mcu_DC_refine;
+ else {
+ entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ if (jsimd_can_encode_mcu_AC_refine_prepare())
+ entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+ else
+ entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
+ /* AC refinement needs a correction bit buffer; it is allocated once and
+ * reused by later AC refinement scans
+ */
+ if (entropy->bit_buffer == NULL)
+ entropy->bit_buffer = (char *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ MAX_CORR_BITS * sizeof(char));
+ }
+ }
+ if (gather_statistics)
+ entropy->pub.finish_pass = finish_pass_gather_phuff;
+ else
+ entropy->pub.finish_pass = finish_pass_phuff;
+
+ /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1
+ * for AC coefficients.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ /* Initialize DC predictions to 0 */
+ entropy->last_dc_val[ci] = 0;
+ /* Get table index */
+ if (is_DC_band) {
+ if (cinfo->Ah != 0) /* DC refinement needs no table */
+ continue;
+ tbl = compptr->dc_tbl_no;
+ } else {
+ entropy->ac_tbl_no = tbl = compptr->ac_tbl_no; /* remembered for emit_symbol() calls in AC scans */
+ }
+ if (gather_statistics) {
+ /* Check for invalid table index */
+ /* (make_c_derived_tbl does this in the other path) */
+ if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+ /* Allocate and zero the statistics tables */
+ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+ if (entropy->count_ptrs[tbl] == NULL)
+ entropy->count_ptrs[tbl] = (long *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ 257 * sizeof(long));
+ memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long)); /* zeroed on every pass, even when the table is reused */
+ } else {
+ /* Compute derived values for Huffman table */
+ /* We may do this more than once for a table, but it's not expensive */
+ jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
+ &entropy->derived_tbls[tbl]);
+ }
+ }
+
+ /* Initialize AC stuff */
+ entropy->EOBRUN = 0;
+ entropy->BE = 0;
+
+ /* Initialize bit buffer to empty */
+ entropy->put_buffer = 0;
+ entropy->put_bits = 0;
+
+ /* Initialize restart stuff */
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num = 0;
+}
+
+
+/* Outputting bytes to the file.
+ * NB: these must be called only when actually outputting,
+ * that is, entropy->gather_statistics == FALSE.
+ */
+
+/* Emit a byte: store val through entropy->next_output_byte, advance the
+ * pointer, and call dump_buffer() when free_in_buffer drops to zero.
+ * The macro expands to a brace-enclosed block, not an expression, and it
+ * evaluates its entropy argument more than once.
+ */
+#define emit_byte(entropy, val) { \
+ *(entropy)->next_output_byte++ = (JOCTET)(val); \
+ if (--(entropy)->free_in_buffer == 0) \
+ dump_buffer(entropy); \
+}
+
+
+LOCAL(void)
+dump_buffer(phuff_entropy_ptr entropy)
+/* Hand the full output buffer to the destination manager and re-sync our
+ * local write pointer and free count with it.  This module cannot suspend,
+ * so a refusal from the destination manager is reported as an error.
+ */
+{
+  j_compress_ptr cinfo = entropy->cinfo;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
+
+  if (!(*dest->empty_output_buffer) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* The destination manager has reset its buffer; pick up the new state */
+  entropy->free_in_buffer = dest->free_in_buffer;
+  entropy->next_output_byte = dest->next_output_byte;
+}
+
+
+/* Outputting bits to the file */
+
+/* Only the right 24 bits of put_buffer are used; the valid bits are
+ * left-justified in this part. At most 16 bits can be passed to emit_bits
+ * in one call, and we never retain more than 7 bits in put_buffer
+ * between calls, so 24 bits are sufficient.
+ */
+
+LOCAL(void)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
+/* Emit some bits, unless we are in gather mode.
+ * The low `size` bits of `code` are appended to the pending bits; every
+ * complete byte is written with emit_byte(), followed by a stuffed zero
+ * byte whenever the byte written is 0xFF.  Leftover bits stay in
+ * entropy->put_buffer / entropy->put_bits for the next call.
+ * A size of 0 is reported as JERR_HUFF_MISSING_CODE even in gather mode,
+ * because that check precedes the gather_statistics test.
+ */
+{
+ /* This routine is heavily used, so it's worth coding tightly. */
+ register size_t put_buffer = (size_t)code;
+ register int put_bits = entropy->put_bits;
+
+ /* if size is 0, caller used an invalid Huffman table entry */
+ if (size == 0)
+ ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+ if (entropy->gather_statistics)
+ return; /* do nothing if we're only getting stats */
+
+ put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
+
+ put_bits += size; /* new number of bits in buffer */
+
+ put_buffer <<= 24 - put_bits; /* align incoming bits */
+
+ put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
+
+ while (put_bits >= 8) {
+ int c = (int)((put_buffer >> 16) & 0xFF); /* top byte of the 24-bit window */
+
+ emit_byte(entropy, c);
+ if (c == 0xFF) { /* need to stuff a zero byte? */
+ emit_byte(entropy, 0);
+ }
+ put_buffer <<= 8;
+ put_bits -= 8;
+ }
+
+ entropy->put_buffer = put_buffer; /* update variables */
+ entropy->put_bits = put_bits;
+}
+
+
+/* Complete the pending partial byte with one-bits so that it gets written,
+ * then clear the bit accumulator.
+ */
+LOCAL(void)
+flush_bits(phuff_entropy_ptr entropy)
+{
+  /* Seven one-bits are enough to push out any partially filled byte */
+  emit_bits(entropy, 0x7F, 7);
+
+  /* Discard the padding bits that did not make it into that byte */
+  entropy->put_bits = 0;
+  entropy->put_buffer = 0;
+}
+
+
+/*
+ * Account for one Huffman symbol: bump its frequency counter during a
+ * statistics-gathering pass, or write its code bits during an output pass.
+ */
+
+LOCAL(void)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+  c_derived_tbl *htbl;
+
+  if (entropy->gather_statistics) {
+    /* Counting only; nothing is written */
+    entropy->count_ptrs[tbl_no][symbol]++;
+    return;
+  }
+
+  /* Look up the code and its length in the derived table and emit it */
+  htbl = entropy->derived_tbls[tbl_no];
+  emit_bits(entropy, htbl->ehufco[symbol], htbl->ehufsi[symbol]);
+}
+
+
+/*
+ * Write out nbits correction bits stored one per char starting at bufstart.
+ * Does nothing during a statistics-gathering pass.
+ */
+
+LOCAL(void)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+                   unsigned int nbits)
+{
+  unsigned int i;
+
+  if (entropy->gather_statistics)
+    return;                     /* correction bits carry no statistics */
+
+  /* bufstart is only dereferenced when nbits is nonzero */
+  for (i = 0; i < nbits; i++)
+    emit_bits(entropy, (unsigned int)(bufstart[i]), 1);
+}
+
+
+/*
+ * Flush the pending end-of-block run, if there is one.
+ *
+ * The run is coded as symbol (nbits << 4), where nbits is one less than the
+ * bit length of EOBRUN, followed by the low nbits bits of EOBRUN.  The
+ * correction bits buffered ahead of the run are written right after it.
+ * When no run is pending nothing happens at all; in particular the
+ * correction-bit count BE is left untouched.
+ */
+
+LOCAL(void)
+emit_eobrun(phuff_entropy_ptr entropy)
+{
+  register int run, nbits;
+
+  if (entropy->EOBRUN == 0)
+    return;                     /* no run pending */
+
+  run = entropy->EOBRUN;
+  nbits = JPEG_NBITS_NONZERO(run) - 1;
+  /* safety check: shouldn't happen given limited correction-bit buffer */
+  if (nbits > 14)
+    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+  emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
+  if (nbits != 0)               /* emit_bits rejects a size of 0 */
+    emit_bits(entropy, entropy->EOBRUN, nbits);
+
+  entropy->EOBRUN = 0;
+
+  /* The buffered correction bits belong to the run just written */
+  emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
+  entropy->BE = 0;
+}
+
+
+/*
+ * Write restart marker number restart_num and reset the prediction state.
+ * Any pending EOB run is flushed first.  During a statistics-gathering pass
+ * no bytes are written, but the state is still reset.
+ */
+
+LOCAL(void)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
+{
+  j_compress_ptr cinfo = entropy->cinfo;
+  int ci;
+
+  emit_eobrun(entropy);
+
+  if (!entropy->gather_statistics) {
+    /* Byte-align the stream, then write the two-byte RSTn marker */
+    flush_bits(entropy);
+    emit_byte(entropy, 0xFF);
+    emit_byte(entropy, JPEG_RST0 + restart_num);
+  }
+
+  if (cinfo->Ss != 0) {
+    /* AC scan: clear all AC-related fields */
+    entropy->EOBRUN = 0;
+    entropy->BE = 0;
+    return;
+  }
+
+  /* DC scan: every component's DC prediction starts over at 0 */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++)
+    entropy->last_dc_val[ci] = 0;
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ *
+ * For every block of the MCU, the DC coefficient is shifted right by Al,
+ * differenced against the previous value of the same component, and coded
+ * as a Huffman symbol giving the bit count followed by that many raw bits.
+ * Always returns TRUE (this module does not support suspension).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, temp2, temp3;
+ register int nbits;
+ int blkn, ci;
+ int Al = cinfo->Al;
+ JBLOCKROW block;
+ jpeg_component_info *compptr;
+ ISHIFT_TEMPS /* declares the temporary that IRIGHT_SHIFT may need */
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte; /* work on local copies of the destination state */
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart(entropy, entropy->next_restart_num);
+
+ /* Encode the MCU data blocks */
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn];
+ compptr = cinfo->cur_comp_info[ci];
+
+ /* Compute the DC value after the required point transform by Al.
+ * This is simply an arithmetic right shift.
+ */
+ temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
+
+ /* DC differences are figured on the point-transformed values. */
+ temp = temp2 - entropy->last_dc_val[ci];
+ entropy->last_dc_val[ci] = temp2;
+
+ /* Encode the DC coefficient difference per section G.1.2.1 */
+
+ /* This is a well-known technique for obtaining the absolute value without
+ * a branch. It is derived from an assembly language technique presented
+ * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+ * 1997 by Agner Fog.
+ */
+ temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); /* sign mask: relies on arithmetic shift giving -1 for negative temp */
+ temp ^= temp3;
+ temp -= temp3; /* temp is abs value of input */
+ /* For a negative input, want temp2 = bitwise complement of abs(input) */
+ temp2 = temp ^ temp3;
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+ nbits = JPEG_NBITS(temp);
+ /* Check for out-of-range coefficient values.
+ * Since we're encoding a difference, the range limit is twice as much.
+ */
+ if (nbits > MAX_COEF_BITS + 1)
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+ /* Count/emit the Huffman-coded symbol for the number of bits */
+ emit_symbol(entropy, compptr->dc_tbl_no, nbits);
+
+ /* Emit that number of bits of the value, if positive, */
+ /* or the complement of its magnitude, if negative. */
+ if (nbits) /* emit_bits rejects calls with size 0 */
+ emit_bits(entropy, (unsigned int)temp2, nbits);
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte; /* publish the updated write position */
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7; /* restart numbers cycle through 0-7 */
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * Data preparation for encode_mcu_AC_first().
+ *
+ * COMPUTE_ABSVALUES_AC_FIRST visits Sl coefficients of `block` in the order
+ * given by jpeg_natural_order_start and computes abs(coef) >> Al for each.
+ * For every nonzero result it stores the magnitude in values[k], the value
+ * to be emitted (the magnitude, or its bitwise complement for a negative
+ * coefficient) in values[k + DCTSIZE2], and sets bit k of zerobits.
+ * Despite its name, a set bit in zerobits marks a NONZERO coefficient.
+ * Entries of `values` that correspond to zero results are not written.
+ * The macro uses the locals k, temp, temp2 and zerobits of the enclosing
+ * function.
+ */
+
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ if (temp == 0) \
+ continue; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value; so the code is \
+ * interwoven with finding the abs value (temp) and output bits (temp2). \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ /* Watch out for case that nonzero coef is zero after point transform */ \
+ if (temp == 0) \
+ continue; \
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+ temp2 ^= temp; \
+ values[k] = (UJCOEF)temp; \
+ values[k + DCTSIZE2] = (UJCOEF)temp2; \
+ zerobits |= ((size_t)1U) << k; \
+ } \
+}
+
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *bits)
+{
+ register int k, temp, temp2;
+ size_t zerobits = 0U;
+ int Sl0 = Sl; /* number of coefficients covered by the first mask word */
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32) /* a 32-bit size_t can flag only 32 coefficients per word */
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+ zerobits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ values += 32; /* the macro's index k restarts at 0 for the second half */
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl);
+ }
+ bits[1] = zerobits; /* mask for coefficients 32 and up (0 if there are none) */
+#endif
+}
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ *
+ * ENCODE_COEFS_AC_FIRST walks the nonzero-coefficient mask in zerobits:
+ * count_zeroes() yields the run of zero coefficients before the next
+ * nonzero one, cvalue is advanced to it, and the run/size symbol plus the
+ * value bits are emitted.  The `label` argument is pasted at the point
+ * where r and cvalue are already set up, so that the 32-bit size_t path can
+ * jump into the loop body with a run that spans both mask words.
+ * The macro uses the locals r, cvalue, temp, temp2, nbits, zerobits,
+ * entropy and cinfo of the enclosing function.
+ */
+
+#define ENCODE_COEFS_AC_FIRST(label) { \
+ while (zerobits) { \
+ r = count_zeroes(&zerobits); \
+ cvalue += r; \
+label \
+ temp = cvalue[0]; \
+ temp2 = cvalue[DCTSIZE2]; \
+ \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ } \
+ \
+ /* Find the number of bits needed for the magnitude of the coefficient */ \
+ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
+ /* Check for out-of-range coefficient values */ \
+ if (nbits > MAX_COEF_BITS) \
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+ \
+ /* Emit that number of bits of the value, if positive, */ \
+ /* or the complement of its magnitude, if negative. */ \
+ emit_bits(entropy, (unsigned int)temp2, nbits); \
+ \
+ cvalue++; \
+ zerobits >>= 1; \
+ } \
+}
+
+METHODDEF(boolean)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, temp2;
+ register int nbits, r;
+ int Sl = cinfo->Se - cinfo->Ss + 1; /* number of coefficients in the spectral band */
+ int Al = cinfo->Al;
+ UJCOEF values_unaligned[2 * DCTSIZE2 + 15]; /* magnitudes, then emit values; 15 spare elements for alignment */
+ UJCOEF *values;
+ const UJCOEF *cvalue;
+ size_t zerobits;
+ size_t bits[8 / SIZEOF_SIZE_T]; /* 64 mask bits: one word, or two when size_t is 4 bytes */
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart(entropy, entropy->next_restart_num);
+
+#ifdef WITH_SIMD
+ cvalue = values = (UJCOEF *)PAD((JUINTPTR)values_unaligned, 16); /* round the address up to a multiple of 16 */
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cvalue = values = values_unaligned;
+#endif
+
+ /* Prepare data */
+ entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, values, bits);
+
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+ zerobits |= bits[1]; /* combined only to test whether any coefficient is nonzero */
+#endif
+
+ /* Emit any pending EOBRUN */
+ if (zerobits && (entropy->EOBRUN > 0))
+ emit_eobrun(entropy);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[0]; /* back to the first mask word for the encoding loop */
+#endif
+
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+
+ ENCODE_COEFS_AC_FIRST((void)0;);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ if (zerobits) {
+ int diff = ((values + DCTSIZE2 / 2) - cvalue); /* coefficients left unvisited in the first half */
+ r = count_zeroes(&zerobits);
+ r += diff; /* the zero run spans the boundary between the two mask words */
+ cvalue += r;
+ goto first_iter_ac_first;
+ }
+
+ ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+ if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
+ entropy->EOBRUN++; /* count an EOB */
+ if (entropy->EOBRUN == 0x7FFF)
+ emit_eobrun(entropy); /* force it out to avoid overflow */
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU encoding for a DC successive-approximation refinement scan.
+ * One raw bit -- bit Al of the DC coefficient -- is written per block; no
+ * Huffman table is involved.  Such scans are assumed to be allowed to cover
+ * several components, although the spec is not very clear on the point.
+ * Always returns TRUE.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  const int Al = cinfo->Al;
+  int blkn;
+
+  /* Work on local copies of the destination manager's write position */
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* A new restart interval begins with a restart marker */
+  if (cinfo->restart_interval && entropy->restarts_to_go == 0)
+    emit_restart(entropy, entropy->next_restart_num);
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    int dc = MCU_data[blkn][0][0];      /* DC coefficient of this block */
+
+    /* emit_bits keeps only the lowest bit of the shifted value */
+    emit_bits(entropy, (unsigned int)(dc >> Al), 1);
+  }
+
+  /* Publish the updated write position */
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+  /* Restart bookkeeping: count this MCU against the current interval */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num = (entropy->next_restart_num + 1) & 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Data preparation for encode_mcu_AC_refine().
+ *
+ * COMPUTE_ABSVALUES_AC_REFINE stores abs(coef) >> Al for each of Sl
+ * coefficients in absvalues[k] (zero results included).  For every nonzero
+ * result it sets bit k of zerobits and puts the sign flag in bit k of
+ * signbits (1 for a non-negative coefficient, 0 for a negative one).
+ * EOB receives k + koffset for the last coefficient whose result is
+ * exactly 1.  The macro uses the locals k, temp, temp2, zerobits, signbits
+ * and EOB of the enclosing function.
+ */
+
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+ /* It is convenient to make a pre-pass to determine the transformed \
+ * coefficients' absolute values and the EOB position. \
+ */ \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value. \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ if (temp != 0) { \
+ zerobits |= ((size_t)1U) << k; \
+ signbits |= ((size_t)(temp2 + 1)) << k; \
+ } \
+ absvalues[k] = (UJCOEF)temp; /* save abs value for main pass */ \
+ if (temp == 1) \
+ EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
+ } \
+}
+
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ register int k, temp, temp2;
+ int EOB = 0; /* stays 0 when no coefficient has a transformed magnitude of 1 */
+ size_t zerobits = 0U, signbits = 0U;
+ int Sl0 = Sl; /* number of coefficients covered by the first mask word */
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+ bits[1] = signbits; /* 64-bit layout: [0] = nonzero mask, [1] = sign mask */
+#else
+ bits[2] = signbits; /* 32-bit layout: [0],[1] = nonzero mask halves, [2],[3] = sign mask halves */
+
+ zerobits = 0U;
+ signbits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ absvalues += 32;
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+ }
+
+ bits[1] = zerobits;
+ bits[3] = signbits;
+#endif
+
+ return EOB;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ *
+ * ENCODE_COEFS_AC_REFINE walks the nonzero-coefficient mask in zerobits,
+ * keeping signbits shifted in step with it.  A coefficient whose
+ * transformed magnitude exceeds 1 contributes only a correction bit, which
+ * is appended to BR_buffer.  A coefficient whose magnitude is exactly 1 is
+ * coded as a run/size symbol followed by its sign bit and the correction
+ * bits buffered so far.  The `label` argument is pasted after the run
+ * bookkeeping so that the 32-bit size_t path can jump into the loop body.
+ * The macro uses the locals idx, r, cabsvalue, EOBPTR, temp, zerobits,
+ * signbits, BR, BR_buffer and entropy of the enclosing function.
+ */
+
+#define ENCODE_COEFS_AC_REFINE(label) { \
+ while (zerobits) { \
+ idx = count_zeroes(&zerobits); \
+ r += idx; \
+ cabsvalue += idx; \
+ signbits >>= idx; \
+label \
+ /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+ while (r > 15 && (cabsvalue <= EOBPTR)) { \
+ /* emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ /* Emit ZRL */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ /* Emit buffered correction bits that must be associated with ZRL */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ } \
+ \
+ temp = *cabsvalue++; \
+ \
+ /* If the coef was previously nonzero, it only needs a correction bit. \
+ * NOTE: a straight translation of the spec's figure G.7 would suggest \
+ * that we also need to test r > 15. But if r > 15, we can only get here \
+ * if k > EOB, which implies that this coefficient is not 1. \
+ */ \
+ if (temp > 1) { \
+ /* The correction bit is the next bit of the absolute value. */ \
+ BR_buffer[BR++] = (char)(temp & 1); \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ continue; \
+ } \
+ \
+ /* Emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+ \
+ /* Emit output bit for newly-nonzero coef */ \
+ temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+ emit_bits(entropy, (unsigned int)temp, 1); \
+ \
+ /* Emit buffered correction bits that must be associated with this code */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ r = 0; /* reset zero run length */ \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ } \
+}
+
+METHODDEF(boolean)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, r, idx;
+ char *BR_buffer;
+ unsigned int BR;
+ int Sl = cinfo->Se - cinfo->Ss + 1; /* number of coefficients in the spectral band */
+ int Al = cinfo->Al;
+ UJCOEF absvalues_unaligned[DCTSIZE2 + 15]; /* 15 spare elements leave room for alignment */
+ UJCOEF *absvalues;
+ const UJCOEF *cabsvalue, *EOBPTR;
+ size_t zerobits, signbits;
+ size_t bits[16 / SIZEOF_SIZE_T]; /* 64 nonzero-mask bits plus 64 sign bits */
+
+ entropy->next_output_byte = cinfo->dest->next_output_byte;
+ entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+ /* Emit restart marker if needed */
+ if (cinfo->restart_interval)
+ if (entropy->restarts_to_go == 0)
+ emit_restart(entropy, entropy->next_restart_num);
+
+#ifdef WITH_SIMD
+ cabsvalue = absvalues = (UJCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cabsvalue = absvalues = absvalues_unaligned;
+#endif
+
+ /* Prepare data; the prepare routine's return value is the index of the
+ * last newly-nonzero coefficient, turned into a pointer here
+ */
+ EOBPTR = absvalues +
+ entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, absvalues, bits);
+
+ /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
+
+ r = 0; /* r = run length of zeros */
+ BR = 0; /* BR = count of buffered bits added now */
+ BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
+
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+ signbits = bits[1];
+#else
+ signbits = bits[2];
+#endif
+ ENCODE_COEFS_AC_REFINE((void)0;);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ signbits = bits[3];
+
+ if (zerobits) {
+ int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue); /* coefficients left unvisited in the first half */
+ idx = count_zeroes(&zerobits);
+ signbits >>= idx; /* the sign mask moves only by the zeros inside the second word */
+ idx += diff;
+ r += idx;
+ cabsvalue += idx;
+ goto first_iter_ac_refine;
+ }
+
+ ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
+
+ r |= (int)((absvalues + Sl) - cabsvalue); /* nonzero if a zero run is pending or coefficients remain unvisited */
+
+ if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
+ entropy->EOBRUN++; /* count an EOB */
+ entropy->BE += BR; /* concat my correction bits to older ones */
+ /* We force out the EOB if we risk either:
+ * 1. overflow of the EOB counter;
+ * 2. overflow of the correction bit buffer during the next MCU.
+ */
+ if (entropy->EOBRUN == 0x7FFF ||
+ entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
+ emit_eobrun(entropy);
+ }
+
+ cinfo->dest->next_output_byte = entropy->next_output_byte;
+ cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+ /* Update restart-interval state too */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0) {
+ entropy->restarts_to_go = cinfo->restart_interval;
+ entropy->next_restart_num++;
+ entropy->next_restart_num &= 7;
+ }
+ entropy->restarts_to_go--;
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * End-of-scan cleanup for an output pass: write the pending EOB run and the
+ * partial final byte, keeping the destination manager's state in sync.
+ */
+
+METHODDEF(void)
+finish_pass_phuff(j_compress_ptr cinfo)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+
+  /* Pick up the destination manager's current write position */
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+
+  /* Nothing may stay buffered past the end of the scan */
+  emit_eobrun(entropy);
+  flush_bits(entropy);
+
+  /* Hand the updated position back */
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+}
+
+
+/*
+ * End-of-scan cleanup for a statistics-gathering pass: turn the symbol
+ * counts collected during the scan into optimal Huffman tables.
+ */
+
+METHODDEF(void)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  boolean is_DC_band = (cinfo->Ss == 0);
+  boolean did[NUM_HUFF_TBLS];
+  JHUFF_TBL **htblptr;
+  jpeg_component_info *compptr;
+  int ci, tbl;
+
+  /* A pending EOB run still has to be counted as a symbol */
+  emit_eobrun(entropy);
+
+  /* jpeg_gen_optimal_table clobbers the frequency counts it is given, so
+   * each table must be processed at most once; did[] keeps track of that.
+   */
+  memset(did, 0, sizeof(did));
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+
+    if (is_DC_band && cinfo->Ah != 0)
+      continue;                 /* DC refinement needs no table */
+
+    tbl = is_DC_band ? compptr->dc_tbl_no : compptr->ac_tbl_no;
+    if (did[tbl])
+      continue;                 /* already generated for an earlier component */
+
+    htblptr = is_DC_band ? &cinfo->dc_huff_tbl_ptrs[tbl] :
+                           &cinfo->ac_huff_tbl_ptrs[tbl];
+    if (*htblptr == NULL)
+      *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
+    jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
+    did[tbl] = TRUE;
+  }
+}
+
+
+/*
+ * Create the progressive Huffman entropy encoder object and attach it to
+ * cinfo.  Only start_pass is installed here; the remaining method pointers
+ * are chosen per scan by start_pass_phuff().
+ */
+
+GLOBAL(void)
+jinit_phuff_encoder(j_compress_ptr cinfo)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(phuff_entropy_encoder));
+  int tblno;
+
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
+  entropy->pub.start_pass = start_pass_phuff;
+
+  /* The correction-bit buffer is needed only in AC refinement scans */
+  entropy->bit_buffer = NULL;
+
+  /* No derived tables or statistics tables exist yet */
+  for (tblno = NUM_HUFF_TBLS - 1; tblno >= 0; tblno--) {
+    entropy->count_ptrs[tblno] = NULL;
+    entropy->derived_tbls[tblno] = NULL;
+  }
+}
+
+#endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/media/libjpeg/jcprepct.c b/media/libjpeg/jcprepct.c
new file mode 100644
index 0000000000..f27cc34507
--- /dev/null
+++ b/media/libjpeg/jcprepct.c
@@ -0,0 +1,351 @@
+/*
+ * jcprepct.c
+ *
+ * This file is part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the compression preprocessing controller.
+ * This controller manages the color conversion, downsampling,
+ * and edge expansion steps.
+ *
+ * Most of the complexity here is associated with buffering input rows
+ * as required by the downsampler. See the comments at the head of
+ * jcsample.c for the downsampler's needs.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* At present, jcsample.c can request context rows only for smoothing.
+ * In the future, we might also need context rows for CCIR601 sampling
+ * or other more-complex downsampling procedures. The code to support
+ * context rows should be compiled only if needed.
+ */
+#ifdef INPUT_SMOOTHING_SUPPORTED
+#define CONTEXT_ROWS_SUPPORTED
+#endif
+
+
+/*
+ * For the simple (no-context-row) case, we just need to buffer one
+ * row group's worth of pixels for the downsampling step. At the bottom of
+ * the image, we pad to a full row group by replicating the last pixel row.
+ * The downsampler's last output row is then replicated if needed to pad
+ * out to a full iMCU row.
+ *
+ * When providing context rows, we must buffer three row groups' worth of
+ * pixels. Three row groups are physically allocated, but the row pointer
+ * arrays are made five row groups high, with the extra pointers above and
+ * below "wrapping around" to point to the last and first real row groups.
+ * This allows the downsampler to access the proper context rows.
+ * At the top and bottom of the image, we create dummy context rows by
+ * copying the first or last real pixel row. This copying could be avoided
+ * by pointer hacking as is done in jdmainct.c, but it doesn't seem worth the
+ * trouble on the compression side.
+ */
+
+
+/* Private buffer controller object (installed as cinfo->prep) */
+
+typedef struct {
+ struct jpeg_c_prep_controller pub; /* public fields */
+
+ /* Downsampling input buffer, one row-pointer array per component. It
+ * holds color-converted data until there is enough for a downsample step.
+ */
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+
+ JDIMENSION rows_to_go; /* source image rows not yet consumed; 0 = bottom */
+ int next_buf_row; /* index of next row to store in color_buf */
+
+#ifdef CONTEXT_ROWS_SUPPORTED /* only needed for context case */
+ int this_row_group; /* starting row index of group to process */
+ int next_buf_stop; /* downsample when next_buf_row reaches this index */
+#endif
+} my_prep_controller;
+
+typedef my_prep_controller *my_prep_ptr; /* type cinfo->prep is cast to */
+
+
+/*
+ * Reset the preprocessing controller at the start of a pass.
+ * Only JBUF_PASS_THRU is accepted; any other mode is reported as an error.
+ */
+
+METHODDEF(void)
+start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_prep_ptr ctl;
+
+  if (pass_mode != JBUF_PASS_THRU)
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
+  ctl = (my_prep_ptr)cinfo->prep;
+#ifdef CONTEXT_ROWS_SUPPORTED
+  /* Context-mode state.  It is ignored in non-context mode, so it is set
+   * unconditionally: the first downsample happens once two row groups have
+   * been read, starting with the group at row 0.
+   */
+  ctl->next_buf_stop = 2 * cinfo->max_v_samp_factor;
+  ctl->this_row_group = 0;
+#endif
+  ctl->next_buf_row = 0;                  /* conversion buffer is empty */
+  ctl->rows_to_go = cinfo->image_height;  /* used to detect image bottom */
+}
+
+
+/*
+ * Pad image_data from input_rows up to output_rows rows by copying the last
+ * real row (index input_rows - 1) into every missing row, num_cols wide.
+ */
+
+LOCAL(void)
+expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+                   int output_rows)
+{
+  int dest = input_rows;
+  const int last_real = input_rows - 1;
+
+  /* Nothing is copied when output_rows <= input_rows. */
+  while (dest < output_rows)
+    jcopy_sample_rows(image_data, last_real, image_data, dest++, 1, num_cols);
+}
+
+
+/*
+ * Preprocess input rows when the downsampler needs no context rows.
+ *
+ * Output is produced in "row groups": v_samp_factor sample rows of each
+ * component, obtained by downsampling max_v_samp_factor input rows.  The
+ * routine stops when the input rows or the output row groups run out;
+ * *in_row_ctr and *out_row_group_ctr are advanced past whatever was used.
+ */
+
+METHODDEF(void)
+pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                 JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                 JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                 JDIMENSION out_row_groups_avail)
+{
+  my_prep_ptr ctl = (my_prep_ptr)cinfo->prep;
+  jpeg_component_info *comp;
+  JDIMENSION rows_offered;
+  int take, c;
+
+  for (;;) {
+    if (*in_row_ctr >= in_rows_avail ||
+        *out_row_group_ctr >= out_row_groups_avail)
+      return;                   /* out of input or out of output space */
+
+    /* Color-convert the rows the group still lacks, capped at what is offered */
+    rows_offered = in_rows_avail - *in_row_ctr;
+    take = cinfo->max_v_samp_factor - ctl->next_buf_row;
+    take = (int)MIN((JDIMENSION)take, rows_offered);
+    (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
+                                       ctl->color_buf,
+                                       (JDIMENSION)ctl->next_buf_row, take);
+    ctl->rows_to_go -= take;
+    ctl->next_buf_row += take;
+    *in_row_ctr += take;
+    /* Image exhausted with a partial row group: replicate the last row. */
+    if (ctl->rows_to_go == 0 &&
+        ctl->next_buf_row < cinfo->max_v_samp_factor) {
+      for (c = 0; c < cinfo->num_components; c++)
+        expand_bottom_edge(ctl->color_buf[c], cinfo->image_width,
+                           ctl->next_buf_row, cinfo->max_v_samp_factor);
+      ctl->next_buf_row = cinfo->max_v_samp_factor;
+    }
+    /* A complete row group is buffered: downsample it and start over. */
+    if (ctl->next_buf_row == cinfo->max_v_samp_factor) {
+      (*cinfo->downsample->downsample) (cinfo, ctl->color_buf, (JDIMENSION)0,
+                                        output_buf, *out_row_group_ctr);
+      (*out_row_group_ctr)++;
+      ctl->next_buf_row = 0;
+    }
+    /* At the bottom of the image, fill the rest of the output buffer by
+     * replicating rows.  This assumes the caller supplied an output buffer
+     * that is exactly one iMCU row high.
+     */
+    if (ctl->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
+      for (c = 0, comp = cinfo->comp_info; c < cinfo->num_components;
+           c++, comp++)
+        expand_bottom_edge(output_buf[c], comp->width_in_blocks * DCTSIZE,
+                           (int)(*out_row_group_ctr * comp->v_samp_factor),
+                           (int)(out_row_groups_avail * comp->v_samp_factor));
+      *out_row_group_ctr = out_row_groups_avail;
+      return;
+    }
+  }
+}
+
+
+#ifdef CONTEXT_ROWS_SUPPORTED
+
+/*
+ * Process some data in the context case (color_buf is a 3-row-group ring).
+ */
+
+METHODDEF(void)
+pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail)
+{
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
+ int numrows, ci;
+ int buf_height = cinfo->max_v_samp_factor * 3; /* row count at which indexes wrap */
+ JDIMENSION inrows;
+
+ while (*out_row_group_ctr < out_row_groups_avail) {
+ if (*in_row_ctr < in_rows_avail) {
+ /* Color-convert up to next_buf_stop, limited by the rows supplied. */
+ inrows = in_rows_avail - *in_row_ctr;
+ numrows = prep->next_buf_stop - prep->next_buf_row;
+ numrows = (int)MIN((JDIMENSION)numrows, inrows);
+ (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
+ prep->color_buf,
+ (JDIMENSION)prep->next_buf_row,
+ numrows);
+ /* No rows consumed yet for this image: replicate row 0 into the rows above it */
+ if (prep->rows_to_go == cinfo->image_height) {
+ for (ci = 0; ci < cinfo->num_components; ci++) {
+ int row;
+ for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
+ jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+ -row, 1, cinfo->image_width); /* negative index = row above row 0 */
+ }
+ }
+ }
+ *in_row_ctr += numrows;
+ prep->next_buf_row += numrows;
+ prep->rows_to_go -= numrows;
+ } else {
+ /* Return for more data, unless we are at the bottom of the image. */
+ if (prep->rows_to_go != 0)
+ break;
+ /* When at bottom of image, pad up to next_buf_stop with the last row. */
+ if (prep->next_buf_row < prep->next_buf_stop) {
+ for (ci = 0; ci < cinfo->num_components; ci++) {
+ expand_bottom_edge(prep->color_buf[ci], cinfo->image_width,
+ prep->next_buf_row, prep->next_buf_stop);
+ }
+ prep->next_buf_row = prep->next_buf_stop;
+ }
+ }
+ /* If we've gotten enough data, downsample the group at this_row_group. */
+ if (prep->next_buf_row == prep->next_buf_stop) {
+ (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
+ (JDIMENSION)prep->this_row_group,
+ output_buf, *out_row_group_ctr);
+ (*out_row_group_ctr)++;
+ /* Advance both indexes, wrapping to 0 at buf_height. */
+ prep->this_row_group += cinfo->max_v_samp_factor;
+ if (prep->this_row_group >= buf_height)
+ prep->this_row_group = 0;
+ if (prep->next_buf_row >= buf_height)
+ prep->next_buf_row = 0;
+ prep->next_buf_stop = prep->next_buf_row + cinfo->max_v_samp_factor; /* stop after one more row group */
+ }
+ }
+}
+
+
+/*
+ * Create the wrapped-around color_buf row-pointer arrays used in context mode.
+ */
+
+LOCAL(void)
+create_context_buffer(j_compress_ptr cinfo)
+{
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
+ int rgroup_height = cinfo->max_v_samp_factor; /* rows per row group */
+ int ci, i;
+ jpeg_component_info *compptr;
+ JSAMPARRAY true_buffer, fake_buffer;
+
+ /* Grab enough space for fake row pointers for all the components;
+ * we need five row groups' worth of pointers for each component.
+ */
+ fake_buffer = (JSAMPARRAY)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (cinfo->num_components * 5 * rgroup_height) *
+ sizeof(JSAMPROW));
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Allocate the actual buffer space (3 row groups) for this component.
+ * We make the buffer wide enough to allow the downsampler to edge-expand
+ * horizontally within the buffer, if it so chooses.
+ */
+ true_buffer = (*cinfo->mem->alloc_sarray)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+ cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+ (JDIMENSION)(3 * rgroup_height));
+ /* Copy true buffer row pointers into the middle of the fake row array */
+ memcpy(fake_buffer + rgroup_height, true_buffer,
+ 3 * rgroup_height * sizeof(JSAMPROW));
+ /* Wraparound: top group aliases the last real group, bottom group the first */
+ for (i = 0; i < rgroup_height; i++) {
+ fake_buffer[i] = true_buffer[2 * rgroup_height + i];
+ fake_buffer[4 * rgroup_height + i] = true_buffer[i];
+ }
+ prep->color_buf[ci] = fake_buffer + rgroup_height; /* index 0 = first real row */
+ fake_buffer += 5 * rgroup_height; /* point to space for next component */
+ }
+}
+
+#endif /* CONTEXT_ROWS_SUPPORTED */
+
+
+/*
+ * Create the preprocessing controller and its color-conversion buffer.
+ * A full-image buffer is never supported, so need_full_buffer must be FALSE.
+ */
+
+GLOBAL(void)
+jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+{
+  my_prep_ptr ctl;
+  jpeg_component_info *comp;
+  JDIMENSION buf_width;
+  int c;
+
+  if (need_full_buffer)         /* safety check */
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
+  ctl = (my_prep_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(my_prep_controller));
+  ctl->pub.start_pass = start_pass_prep;
+  cinfo->prep = (struct jpeg_c_prep_controller *)ctl;
+
+  if (cinfo->downsample->need_context_rows) {
+    /* The downsampler wants context rows: use the wraparound buffer. */
+#ifdef CONTEXT_ROWS_SUPPORTED
+    ctl->pub.pre_process_data = pre_process_context;
+    create_context_buffer(cinfo);
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+    return;
+  }
+  /* No context rows: one row group per component is enough.  Each buffer is
+   * made as wide as the padded component scaled back up to full resolution,
+   * so the downsampler may edge-expand horizontally in place.
+   */
+  ctl->pub.pre_process_data = pre_process_data;
+  for (c = 0; c < cinfo->num_components; c++) {
+    comp = &cinfo->comp_info[c];
+    buf_width = (JDIMENSION)(((long)comp->width_in_blocks * DCTSIZE *
+                              cinfo->max_h_samp_factor) / comp->h_samp_factor);
+    ctl->color_buf[c] = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, buf_width,
+       (JDIMENSION)cinfo->max_v_samp_factor);
+  }
+}
diff --git a/media/libjpeg/jcsample.c b/media/libjpeg/jcsample.c
new file mode 100644
index 0000000000..e8515ebf0f
--- /dev/null
+++ b/media/libjpeg/jcsample.c
@@ -0,0 +1,522 @@
+/*
+ * jcsample.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2019, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains downsampling routines.
+ *
+ * Downsampling input data is counted in "row groups". A row group
+ * is defined to be max_v_samp_factor pixel rows of each component,
+ * from which the downsampler produces v_samp_factor sample rows.
+ * A single row group is processed in each call to the downsampler module.
+ *
+ * The downsampler is responsible for edge-expansion of its output data
+ * to fill an integral number of DCT blocks horizontally. The source buffer
+ * may be modified if it is helpful for this purpose (the source buffer is
+ * allocated wide enough to correspond to the desired output width).
+ * The caller (the prep controller) is responsible for vertical padding.
+ *
+ * The downsampler may request "context rows" by setting need_context_rows
+ * during startup. In this case, the input arrays will contain at least
+ * one row group's worth of pixels above and below the passed-in data;
+ * the caller will create dummy rows at image top and bottom by replicating
+ * the first or last real pixel row.
+ *
+ * An excellent reference for image resampling is
+ * Digital Image Warping, George Wolberg, 1990.
+ * Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7.
+ *
+ * The downsampling algorithm used here is a simple average of the source
+ * pixels covered by the output pixel. The hi-falutin sampling literature
+ * refers to this as a "box filter". In general the characteristics of a box
+ * filter are not very good, but for the specific cases we normally use (1:1
+ * and 2:1 ratios) the box is equivalent to a "triangle filter" which is not
+ * nearly so bad. If you intend to use other sampling ratios, you'd be well
+ * advised to improve this code.
+ *
+ * A simple input-smoothing capability is provided. This is mainly intended
+ * for cleaning up color-dithered GIF input files (if you find it inadequate,
+ * we suggest using an external filtering program such as pnmconvol). When
+ * enabled, each input pixel P is replaced by a weighted sum of itself and its
+ * eight neighbors. P's weight is 1-8*SF and each neighbor's weight is SF,
+ * where SF = (smoothing_factor / 1024).
+ * Currently, smoothing is only supported for 2h2v sampling factors.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+
+
+/* Pointer to routine to downsample one row group of a single component */
+typedef void (*downsample1_ptr) (j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+/* Private subobject (installed as cinfo->downsample) */
+
+typedef struct {
+ struct jpeg_downsampler pub; /* public fields */
+
+ /* Downsampling method pointers, one per component, indexed like comp_info */
+ downsample1_ptr methods[MAX_COMPONENTS];
+} my_downsampler;
+
+typedef my_downsampler *my_downsample_ptr; /* type cinfo->downsample is cast to */
+
+
+/*
+ * Initialize for a downsampling pass (installed as pub.start_pass).
+ */
+
+METHODDEF(void)
+start_pass_downsample(j_compress_ptr cinfo)
+{
+ /* no work for now: my_downsampler holds only the method pointers */
+}
+
+
+/*
+ * Widen the first num_rows rows of image_data from input_cols to output_cols
+ * samples by repeating each row's last real sample.  Nothing is written when
+ * output_cols does not exceed input_cols.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  const int pad = (int)(output_cols - input_cols);
+  int r, k;
+
+  if (pad <= 0)
+    return;
+
+  for (r = 0; r < num_rows; r++) {
+    JSAMPROW fill = image_data[r] + input_cols;
+    const JSAMPLE edge = fill[-1];      /* rightmost real sample */
+
+    for (k = 0; k < pad; k++)
+      fill[k] = edge;
+  }
+}
+
+
+/*
+ * Downsample one row group of every component.
+ *
+ * Each component is handled independently by its own methods[] entry.
+ */
+
+METHODDEF(void)
+sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+               JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+               JDIMENSION out_row_group_index)
+{
+  my_downsample_ptr self = (my_downsample_ptr)cinfo->downsample;
+  int c;
+
+  for (c = 0; c < cinfo->num_components; c++) {
+    jpeg_component_info *comp = cinfo->comp_info + c;
+    /* Output rows of this component start at group index * v_samp_factor. */
+    JSAMPARRAY dst = output_buf[c] + out_row_group_index * comp->v_samp_factor;
+    JSAMPARRAY src = input_buf[c] + in_row_index;
+
+    (*self->methods[c]) (cinfo, comp, src, dst);
+  }
+}
+
+
+/*
+ * Downsample one row group of a single component by an arbitrary integral
+ * ratio, without smoothing.  Every output sample is the rounded mean of the
+ * h_expand x v_expand box of input samples it covers.
+ * One row group is processed per call.
+ * The customary sampling ratios are served by other routines, not this one.
+ */
+
+METHODDEF(void)
+int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const int hx = cinfo->max_h_samp_factor / compptr->h_samp_factor;
+  const int vx = cinfo->max_v_samp_factor / compptr->v_samp_factor;
+  const int box = hx * vx;              /* samples averaged per output */
+  const int half_box = box / 2;         /* rounding term */
+  const JDIMENSION out_width = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION x;
+  int out_y, dx, dy;
+  JLONG sum;
+
+  /* Pad the input on the right so the plain loop below can also generate
+   * the output samples that lie in the padding.
+   */
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    out_width * hx);
+
+  /* Output row out_y is built from input rows out_y*vx .. out_y*vx+vx-1. */
+  for (out_y = 0; out_y < compptr->v_samp_factor; out_y++) {
+    JSAMPARRAY band = input_data + out_y * vx;
+    JSAMPROW dst = output_data[out_y];
+
+    for (x = 0; x < out_width; x++) {
+      /* Accumulate the box whose leftmost column is x * hx. */
+      sum = 0;
+      for (dy = 0; dy < vx; dy++) {
+        JSAMPROW src = band[dy] + x * hx;
+
+        for (dx = 0; dx < hx; dx++)
+          sum += (JLONG)src[dx];
+      }
+      /* Round to nearest by adding half the divisor before dividing. */
+      dst[x] = (JSAMPLE)((sum + half_box) / box);
+    }
+  }
+}
+
+
+/*
+ * Downsample one row group of a single full-size component, without
+ * smoothing.  No averaging is needed: the rows are copied unchanged and then
+ * padded on the right out to a whole number of DCT blocks.
+ */
+
+METHODDEF(void)
+fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const int nrows = cinfo->max_v_samp_factor;
+  const JDIMENSION real_cols = cinfo->image_width;
+  const JDIMENSION padded_cols = compptr->width_in_blocks * DCTSIZE;
+
+  jcopy_sample_rows(input_data, 0, output_data, 0, nrows, real_cols);
+  expand_right_edge(output_data, nrows, real_cols, padded_cols);
+}
+
+
+/*
+ * Downsample one row group of a single component for the common 2:1
+ * horizontal, 1:1 vertical case, without smoothing.  Each output sample is
+ * the average of two horizontally adjacent input samples.
+ *
+ * About the "bias": if an exact .5 were always rounded up when converting
+ * the average to an integer, the output would drift noticeably towards
+ * larger values.  The rounding term therefore alternates between 0 and 1
+ * from one output column to the next (a simple ordered dither pattern), so
+ * that .5 is rounded down and up in turn.
+ */
+
+METHODDEF(void)
+h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const JDIMENSION out_width = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION x;
+  int y;
+
+  /* Pad the input on the right so the plain loop below can also generate
+   * the output samples that lie in the padding.
+   */
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    out_width * 2);
+
+  for (y = 0; y < compptr->v_samp_factor; y++) {
+    JSAMPROW src = input_data[y];
+    JSAMPROW dst = output_data[y];
+
+    for (x = 0; x < out_width; x++) {
+      /* Even output columns use bias 0 (round .5 down), odd columns use
+       * bias 1 (round .5 up).
+       */
+      const int bias = (int)(x & 1);
+
+      dst[x] = (JSAMPLE)((src[0] + src[1] + bias) >> 1);
+      src += 2;
+    }
+  }
+}
+
+
+/*
+ * Downsample one row group of a single component for the standard 2:1
+ * horizontal, 2:1 vertical case, without smoothing.  Each output sample is
+ * the average of a 2x2 square of input samples.
+ */
+
+METHODDEF(void)
+h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const JDIMENSION out_width = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION x;
+  int y;
+
+  /* Pad the input on the right so the plain loop below can also generate
+   * the output samples that lie in the padding.
+   */
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    out_width * 2);
+
+  for (y = 0; y < compptr->v_samp_factor; y++) {
+    JSAMPROW top = input_data[2 * y];
+    JSAMPROW bottom = input_data[2 * y + 1];
+    JSAMPROW dst = output_data[y];
+
+    for (x = 0; x < out_width; x++) {
+      /* Rounding term alternates 1,2,1,2,... along the row so that exact
+       * halves are not always rounded the same way.
+       */
+      const int bias = 1 + (int)(x & 1);
+      const int total = top[0] + top[1] + bottom[0] + bottom[1] + bias;
+
+      dst[x] = (JSAMPLE)(total >> 2);
+      top += 2;
+      bottom += 2;
+    }
+  }
+}
+
+
+#ifdef INPUT_SMOOTHING_SUPPORTED
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * with smoothing. One row of context above and below is required.
+ */
+
+METHODDEF(void)
+h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ int inrow, outrow;
+ JDIMENSION colctr;
+ JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
+ JLONG membersum, neighsum, memberscale, neighscale;
+
+ /* Expand input data enough to let all the output samples be generated
+ * by the standard loop; the call starts one row above input_data and covers
+ * two extra rows, so both context rows are expanded as well.
+ */
+ expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2,
+ cinfo->image_width, output_cols * 2);
+
+ /* We don't bother to form the individual "smoothed" input pixel values;
+ * we can directly compute the output which is the average of the four
+ * smoothed values. Each of the four member pixels contributes a fraction
+ * (1-8*SF) to its own smoothed image and a fraction SF to each of the three
+ * other smoothed pixels, therefore a total fraction (1-5*SF)/4 to the final
+ * output. The four corner-adjacent neighbor pixels contribute a fraction
+ * SF to just one smoothed pixel, or SF/4 to the final output; while the
+ * eight edge-adjacent neighbors contribute SF to each of two smoothed
+ * pixels, or SF/2 overall. In order to use integer arithmetic, these
+ * factors are scaled by 2^16 = 65536.
+ * Also recall that SF = smoothing_factor / 1024.
+ */
+
+ memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */
+ neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */
+
+ inrow = 0;
+ for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ above_ptr = input_data[inrow - 1]; /* context row above the pair */
+ below_ptr = input_data[inrow + 2]; /* context row below the pair */
+
+ /* Special case for first column: pretend column -1 is same as column 0 */
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
+ neighsum += neighsum;
+ neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
+ membersum = membersum * memberscale + neighsum * neighscale;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+
+ for (colctr = output_cols - 2; colctr > 0; colctr--) { /* interior columns */
+ /* sum of pixels directly mapped to this output element */
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ /* sum of edge-neighbor pixels */
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
+ /* The edge-neighbors count twice as much as corner-neighbors */
+ neighsum += neighsum;
+ /* Add in the corner-neighbors */
+ neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
+ /* form final output scaled up by 2^16 */
+ membersum = membersum * memberscale + neighsum * neighscale;
+ /* round, descale and output it */
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+ }
+
+ /* Special case for last column: offset 1 stands in for the missing offset 2 */
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
+ neighsum += neighsum;
+ neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
+ membersum = membersum * memberscale + neighsum * neighscale;
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
+
+ inrow += 2;
+ }
+}
+
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the special case of a full-size component,
+ * with smoothing. One row of context above and below is required.
+ */
+
+METHODDEF(void)
+fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ int outrow;
+ JDIMENSION colctr;
+ JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+ register JSAMPROW inptr, above_ptr, below_ptr, outptr;
+ JLONG membersum, neighsum, memberscale, neighscale;
+ int colsum, lastcolsum, nextcolsum; /* 3-row sums of current, previous, next column */
+
+ /* Expand input data enough to let all the output samples be generated
+ * by the standard loop; the call starts one row above input_data and covers
+ * two extra rows, so both context rows are expanded as well.
+ */
+ expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2,
+ cinfo->image_width, output_cols);
+
+ /* Each of the eight neighbor pixels contributes a fraction SF to the
+ * smoothed pixel, while the main pixel contributes (1-8*SF). In order
+ * to use integer arithmetic, these factors are multiplied by 2^16 = 65536.
+ * Also recall that SF = smoothing_factor / 1024.
+ */
+
+ memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */
+ neighscale = cinfo->smoothing_factor * 64; /* scaled SF */
+
+ for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+ above_ptr = input_data[outrow - 1]; /* context row above */
+ below_ptr = input_data[outrow + 1]; /* context row below */
+
+ /* Special case for first column: column 0 also stands in for column -1 */
+ colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+ membersum = *inptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
+ neighsum = colsum + (colsum - membersum) + nextcolsum;
+ membersum = membersum * memberscale + neighsum * neighscale;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ lastcolsum = colsum; colsum = nextcolsum;
+
+ for (colctr = output_cols - 2; colctr > 0; colctr--) { /* interior columns */
+ membersum = *inptr++;
+ above_ptr++; below_ptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
+ neighsum = lastcolsum + (colsum - membersum) + nextcolsum; /* eight neighbors: member itself excluded */
+ membersum = membersum * memberscale + neighsum * neighscale;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16); /* round and descale by 2^16 */
+ lastcolsum = colsum; colsum = nextcolsum;
+ }
+
+ /* Special case for last column: current column also stands in for the next */
+ membersum = *inptr;
+ neighsum = lastcolsum + (colsum - membersum) + colsum;
+ membersum = membersum * memberscale + neighsum * neighscale;
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
+
+ }
+}
+
+#endif /* INPUT_SMOOTHING_SUPPORTED */
+
+
+/*
+ * Module initialization routine for downsampling.
+ * Note that we must select a routine for each component.
+ */
+
+GLOBAL(void)
+jinit_downsampler(j_compress_ptr cinfo)
+{
+ my_downsample_ptr downsample;
+ int ci;
+ jpeg_component_info *compptr;
+ boolean smoothok = TRUE; /* cleared when a component gets a non-smoothing routine */
+
+ downsample = (my_downsample_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_downsampler));
+ cinfo->downsample = (struct jpeg_downsampler *)downsample;
+ downsample->pub.start_pass = start_pass_downsample;
+ downsample->pub.downsample = sep_downsample;
+ downsample->pub.need_context_rows = FALSE; /* set TRUE below if a smoothing routine is chosen */
+
+ if (cinfo->CCIR601_sampling)
+ ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
+
+ /* Verify we can handle the sampling factors, and set up method pointers */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
+ compptr->v_samp_factor == cinfo->max_v_samp_factor) { /* full size (1:1) */
+#ifdef INPUT_SMOOTHING_SUPPORTED
+ if (cinfo->smoothing_factor) {
+ downsample->methods[ci] = fullsize_smooth_downsample;
+ downsample->pub.need_context_rows = TRUE;
+ } else
+#endif
+ downsample->methods[ci] = fullsize_downsample; /* body of the "else" above when smoothing is compiled in */
+ } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
+ compptr->v_samp_factor == cinfo->max_v_samp_factor) { /* 2:1 horizontal, 1:1 vertical */
+ smoothok = FALSE;
+ if (jsimd_can_h2v1_downsample())
+ downsample->methods[ci] = jsimd_h2v1_downsample;
+ else
+ downsample->methods[ci] = h2v1_downsample;
+ } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
+ compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { /* 2:1 horizontal, 2:1 vertical */
+#ifdef INPUT_SMOOTHING_SUPPORTED
+ if (cinfo->smoothing_factor) {
+#if defined(__mips__)
+ if (jsimd_can_h2v2_smooth_downsample())
+ downsample->methods[ci] = jsimd_h2v2_smooth_downsample;
+ else
+#endif
+ downsample->methods[ci] = h2v2_smooth_downsample;
+ downsample->pub.need_context_rows = TRUE;
+ } else
+#endif
+ {
+ if (jsimd_can_h2v2_downsample())
+ downsample->methods[ci] = jsimd_h2v2_downsample;
+ else
+ downsample->methods[ci] = h2v2_downsample;
+ }
+ } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
+ (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { /* any other integral ratio */
+ smoothok = FALSE;
+ downsample->methods[ci] = int_downsample;
+ } else
+ ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
+ }
+
+#ifdef INPUT_SMOOTHING_SUPPORTED
+ /* Smoothing was requested but at least one component could not use it */
+ if (cinfo->smoothing_factor && !smoothok)
+ TRACEMS(cinfo, 0, JTRC_SMOOTH_NOTIMPL);
+#endif
+}
diff --git a/media/libjpeg/jctrans.c b/media/libjpeg/jctrans.c
new file mode 100644
index 0000000000..e121028ec7
--- /dev/null
+++ b/media/libjpeg/jctrans.c
@@ -0,0 +1,401 @@
+/*
+ * jctrans.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1995-1998, Thomas G. Lane.
+ * Modified 2000-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains library routines for transcoding compression,
+ * that is, writing raw DCT coefficient arrays to an output JPEG file.
+ * The routines in jcapimin.c will also be needed by a transcoder.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Forward declarations */
+LOCAL(void) transencode_master_selection(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+
+
+/*
+ * Compression initialization for writing raw-coefficient data.
+ * Before calling this, all parameters and a data destination must be set up.
+ * Call jpeg_finish_compress() to actually write the data.
+ *
+ * The number of passed virtual arrays must match cinfo->num_components.
+ * Note that the virtual arrays need not be filled or even realized at
+ * the time write_coefficients is called; indeed, if the virtual arrays
+ * were requested from this compression object's memory manager, they
+ * typically will be realized during this routine and filled afterwards.
+ */
+
+GLOBAL(void)
+jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
+{
+  /* Transcoding can only be started from the initial compressor state. */
+  if (cinfo->global_state != CSTATE_START) {
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  /* Flag every table as "to be written". */
+  jpeg_suppress_tables(cinfo, FALSE);
+
+  /* Bring the error manager and the destination module to a known state. */
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
+  (*cinfo->dest->init_destination) (cinfo);
+
+  /* Select and initialize the modules used for transcoding. */
+  transencode_master_selection(cinfo, coef_arrays);
+
+  /* From here on we simply wait for jpeg_finish_compress().
+   * next_scanline is zeroed so that jpeg_write_marker works.
+   */
+  cinfo->global_state = CSTATE_WRCOEFS;
+  cinfo->next_scanline = 0;
+}
+
+
+/*
+ * Initialize the compression object with default parameters,
+ * then copy from the source object all parameters needed for lossless
+ * transcoding. Parameters that can be varied without loss (such as
+ * scan script and Huffman optimization) are left in their default states.
+ */
+
+GLOBAL(void)
+jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
+{
+  int slot, comp, k;
+
+  /* The destination must not have started compressing yet. */
+  if (dstinfo->global_state != CSTATE_START) {
+    ERREXIT1(dstinfo, JERR_BAD_STATE, dstinfo->global_state);
+  }
+
+  /* Image geometry and colorspace are taken over from the source. */
+  dstinfo->image_width = srcinfo->image_width;
+  dstinfo->image_height = srcinfo->image_height;
+  dstinfo->input_components = srcinfo->num_components;
+  dstinfo->in_color_space = srcinfo->jpeg_color_space;
+#if JPEG_LIB_VERSION >= 70
+  dstinfo->jpeg_width = srcinfo->output_width;
+  dstinfo->jpeg_height = srcinfo->output_height;
+  dstinfo->min_DCT_h_scaled_size = srcinfo->min_DCT_h_scaled_size;
+  dstinfo->min_DCT_v_scaled_size = srcinfo->min_DCT_v_scaled_size;
+#endif
+
+  /* Start from the library defaults ... */
+  jpeg_set_defaults(dstinfo);
+  /* ... then override the colorspace, because jpeg_set_defaults may have
+   * picked the wrong one (e.g. YCbCr for RGB input) and the header markers
+   * must describe the colorspace the image really uses.
+   */
+  jpeg_set_colorspace(dstinfo, srcinfo->jpeg_color_space);
+  dstinfo->data_precision = srcinfo->data_precision;
+  dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling;
+
+  /* Duplicate every quantization table the source has defined. */
+  for (slot = 0; slot < NUM_QUANT_TBLS; slot++) {
+    JQUANT_TBL *src_tbl = srcinfo->quant_tbl_ptrs[slot];
+    JQUANT_TBL *dst_tbl;
+
+    if (src_tbl == NULL)
+      continue;
+
+    dst_tbl = dstinfo->quant_tbl_ptrs[slot];
+    if (dst_tbl == NULL) {
+      dst_tbl = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
+      dstinfo->quant_tbl_ptrs[slot] = dst_tbl;
+    }
+    memcpy(dst_tbl->quantval, src_tbl->quantval, sizeof(dst_tbl->quantval));
+    dst_tbl->sent_table = FALSE;
+  }
+
+  /* Per-component parameters.  The destination comp_info array is assumed to
+   * have been allocated by jpeg_set_defaults.
+   */
+  dstinfo->num_components = srcinfo->num_components;
+  if (dstinfo->num_components < 1 ||
+      dstinfo->num_components > MAX_COMPONENTS) {
+    ERREXIT2(dstinfo, JERR_COMPONENT_COUNT, dstinfo->num_components,
+             MAX_COMPONENTS);
+  }
+  for (comp = 0; comp < dstinfo->num_components; comp++) {
+    jpeg_component_info *from = &srcinfo->comp_info[comp];
+    jpeg_component_info *to = &dstinfo->comp_info[comp];
+    JQUANT_TBL *saved_tbl, *slot_tbl;
+
+    to->component_id = from->component_id;
+    to->h_samp_factor = from->h_samp_factor;
+    to->v_samp_factor = from->v_samp_factor;
+    to->quant_tbl_no = from->quant_tbl_no;
+
+    /* The table saved with the component has to agree with the table now in
+     * its slot.  A disagreement means the input file re-used the slot, which
+     * the IJG encoder currently cannot reproduce.
+     */
+    slot = to->quant_tbl_no;
+    if (slot < 0 || slot >= NUM_QUANT_TBLS ||
+        srcinfo->quant_tbl_ptrs[slot] == NULL) {
+      ERREXIT1(dstinfo, JERR_NO_QUANT_TABLE, slot);
+    }
+    slot_tbl = srcinfo->quant_tbl_ptrs[slot];
+    saved_tbl = from->quant_table;
+    if (saved_tbl != NULL) {
+      for (k = 0; k < DCTSIZE2; k++) {
+        if (saved_tbl->quantval[k] != slot_tbl->quantval[k]) {
+          ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, slot);
+        }
+      }
+    }
+    /* Huffman table assignments are deliberately not copied; the choice made
+     * by jpeg_set_colorspace is relied upon instead.
+     */
+  }
+
+  /* JFIF version and resolution are not strictly "critical", but copying
+   * them is nearly always right.  In particular, when the application copies
+   * JFIF 1.02 extension markers, the version must follow so that the output
+   * does not carry 1.02 extensions under a claimed version of 1.01.
+   * Version info from mislabeled "2.01" files is *not* copied.
+   */
+  if (!srcinfo->saw_JFIF_marker)
+    return;
+
+  if (srcinfo->JFIF_major_version == 1) {
+    dstinfo->JFIF_major_version = srcinfo->JFIF_major_version;
+    dstinfo->JFIF_minor_version = srcinfo->JFIF_minor_version;
+  }
+  dstinfo->density_unit = srcinfo->density_unit;
+  dstinfo->X_density = srcinfo->X_density;
+  dstinfo->Y_density = srcinfo->Y_density;
+}
+
+
+/*
+ * Master selection of compression modules for transcoding.
+ * This substitutes for jcinit.c's initialization of the full compressor.
+ */
+
+LOCAL(void)
+transencode_master_selection(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays)
+{
+  /* input_components is not actually used when transcoding, but
+   * jcmaster.c's initial_setup complains if it is 0.
+   */
+  cinfo->input_components = 1;
+
+  /* Master control in transcode-only mode; this also checks and processes
+   * the parameters.
+   */
+  jinit_c_master_control(cinfo, TRUE);
+
+  /* Choose the entropy encoder: arithmetic, progressive Huffman, or
+   * sequential Huffman.
+   */
+  if (cinfo->arith_code) {
+#ifdef C_ARITH_CODING_SUPPORTED
+    jinit_arith_encoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+  } else if (cinfo->progressive_mode) {
+#ifdef C_PROGRESSIVE_SUPPORTED
+    jinit_phuff_encoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  } else {
+    jinit_huff_encoder(cinfo);
+  }
+
+  /* Transcoding uses its own coefficient buffer controller. */
+  transencode_coef_controller(cinfo, coef_arrays);
+
+  jinit_marker_writer(cinfo);
+
+  /* All modules are set up, so virtual arrays may now be allocated. */
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+  /* Emit the datastream header (SOI, JFIF) right away and postpone the frame
+   * and scan headers, so that the application can insert special markers
+   * after the SOI.
+   */
+  (*cinfo->marker->write_file_header) (cinfo);
+}
+
+
+/*
+ * The rest of this file is a special implementation of the coefficient
+ * buffer controller. This is similar to jccoefct.c, but it handles only
+ * output from presupplied virtual arrays. Furthermore, we generate any
+ * dummy padding blocks on-the-fly rather than expecting them to be present
+ * in the arrays.
+ */
+
+/* Private buffer controller object.
+ *
+ * Holds the position of the transcoding coefficient controller within the
+ * image, so that compress_output can pick up again at the same MCU after
+ * the entropy encoder has forced a suspension.
+ */
+
+typedef struct {
+ struct jpeg_c_coef_controller pub; /* public fields */
+
+ JDIMENSION iMCU_row_num; /* iMCU row # within image; zeroed by
+ * start_pass_coef, advanced by compress_output */
+ JDIMENSION mcu_ctr; /* counts MCUs processed in current row */
+ int MCU_vert_offset; /* counts MCU rows within iMCU row */
+ int MCU_rows_per_iMCU_row; /* number of such rows needed; recomputed by
+ * start_iMCU_row for every iMCU row */
+
+ /* Virtual block array for each component.  This is the caller-supplied
+ * coef_arrays pointer saved by transencode_coef_controller; compress_output
+ * indexes it by component_index.
+ */
+ jvirt_barray_ptr *whole_image;
+
+ /* Workspace for constructing dummy blocks at right/bottom edges.
+ * Each entry points at one pre-zeroed JBLOCK inside a single allocation
+ * made by transencode_coef_controller.
+ */
+ JBLOCKROW dummy_buffer[C_MAX_BLOCKS_IN_MCU];
+} my_coef_controller;
+
+typedef my_coef_controller *my_coef_ptr;
+
+
+LOCAL(void)
+start_iMCU_row(j_compress_ptr cinfo)
+/* Begin a new iMCU row: work out how many MCU rows it contains and rewind
+ * the within-row counters.
+ */
+{
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  int mcu_rows;
+
+  if (cinfo->comps_in_scan > 1) {
+    /* Interleaved scan: an MCU row and an iMCU row are the same thing. */
+    mcu_rows = 1;
+  } else if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1)) {
+    /* Noninterleaved scan: v_samp_factor MCU rows per iMCU row. */
+    mcu_rows = cinfo->cur_comp_info[0]->v_samp_factor;
+  } else {
+    /* Bottom of the image: process only what is left. */
+    mcu_rows = cinfo->cur_comp_info[0]->last_row_height;
+  }
+
+  coef->MCU_rows_per_iMCU_row = mcu_rows;
+  coef->MCU_vert_offset = 0;
+  coef->mcu_ctr = 0;
+}
+
+
+/*
+ * Initialize for a processing pass.
+ */
+
+METHODDEF(void)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_coef_ptr coef;
+
+  /* This controller can only feed presupplied coefficients to the output. */
+  if (pass_mode != JBUF_CRANK_DEST) {
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+  }
+
+  /* Restart at the top of the image. */
+  coef = (my_coef_ptr)cinfo->coef;
+  coef->iMCU_row_num = 0;
+  start_iMCU_row(cinfo);
+}
+
+
+/*
+ * Process some data.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor block rows for each component in the scan.
+ * The data is obtained from the virtual arrays and fed to the entropy coder.
+ * Returns TRUE if the iMCU row is completed, FALSE if suspended.
+ *
+ * On suspension the current MCU row (MCU_vert_offset) and MCU column
+ * (mcu_ctr) are stored in the controller; the loops below start from those
+ * saved values, so the next call retries the MCU that could not be written.
+ *
+ * NB: input_buf is ignored; it is likely to be a NULL pointer.
+ */
+
+METHODDEF(boolean)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION MCU_col_num; /* index of current MCU within row */
+ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+ int blkn, ci, xindex, yindex, yoffset, blockcnt;
+ JDIMENSION start_col;
+ JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
+ JBLOCKROW MCU_buffer[C_MAX_BLOCKS_IN_MCU];
+ JBLOCKROW buffer_ptr;
+ jpeg_component_info *compptr;
+
+ /* Align the virtual buffers for the components used in this scan.
+ * For each component this requests v_samp_factor block rows, starting at
+ * block row iMCU_row_num * v_samp_factor of that component's array.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ buffer[ci] = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
+ coef->iMCU_row_num * compptr->v_samp_factor,
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
+ }
+
+ /* Loop to process one whole iMCU row */
+ for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
+ yoffset++) {
+ for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row;
+ MCU_col_num++) {
+ /* Construct list of pointers to DCT blocks belonging to this MCU */
+ blkn = 0; /* index of current DCT block within MCU */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ start_col = MCU_col_num * compptr->MCU_width;
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width;
+ for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+ if (coef->iMCU_row_num < last_iMCU_row ||
+ yindex + yoffset < compptr->last_row_height) {
+ /* Fill in pointers to real blocks in this row */
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
+ for (xindex = 0; xindex < blockcnt; xindex++)
+ MCU_buffer[blkn++] = buffer_ptr++;
+ } else {
+ /* At bottom of image, need a whole row of dummy blocks */
+ xindex = 0;
+ }
+ /* Fill in any dummy blocks needed in this row.
+ * Dummy blocks are filled in the same way as in jccoefct.c:
+ * all zeroes in the AC entries, DC entries equal to previous
+ * block's DC value. The init routine has already zeroed the
+ * AC entries, so we need only set the DC entries correctly.
+ * Note that the DC value is read from MCU_buffer[blkn - 1], i.e. the
+ * block placed in the MCU immediately before this one.
+ */
+ for (; xindex < compptr->MCU_width; xindex++) {
+ MCU_buffer[blkn] = coef->dummy_buffer[blkn];
+ MCU_buffer[blkn][0][0] = MCU_buffer[blkn - 1][0][0];
+ blkn++;
+ }
+ }
+ }
+ /* Try to write the MCU. */
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
+ /* Suspension forced; update state counters and exit.
+ * The saved values are the loop start points of the next call.
+ */
+ coef->MCU_vert_offset = yoffset;
+ coef->mcu_ctr = MCU_col_num;
+ return FALSE;
+ }
+ }
+ /* Completed an MCU row, but perhaps not an iMCU row */
+ coef->mcu_ctr = 0;
+ }
+ /* Completed the iMCU row, advance counters for next one */
+ coef->iMCU_row_num++;
+ start_iMCU_row(cinfo);
+ return TRUE;
+}
+
+
+/*
+ * Initialize coefficient buffer controller.
+ *
+ * Each passed coefficient array must be the right size for that
+ * coefficient: width_in_blocks wide and height_in_blocks high,
+ * with unitheight at least v_samp_factor.
+ */
+
+LOCAL(void)
+transencode_coef_controller(j_compress_ptr cinfo,
+                            jvirt_barray_ptr *coef_arrays)
+{
+  const size_t dummy_bytes = C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK);
+  my_coef_ptr controller;
+  JBLOCKROW dummy_blocks;
+  int blk;
+
+  /* Create the controller object and hook up its methods. */
+  controller = (my_coef_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_coef_controller));
+  cinfo->coef = (struct jpeg_c_coef_controller *)controller;
+  controller->pub.start_pass = start_pass_coef;
+  controller->pub.compress_data = compress_output;
+
+  /* Remember where the caller's virtual arrays are. */
+  controller->whole_image = coef_arrays;
+
+  /* One zero-filled block per MCU position serves as the dummy DCT block
+   * for that position.
+   */
+  dummy_blocks = (JBLOCKROW)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                dummy_bytes);
+  jzero_far((void *)dummy_blocks, dummy_bytes);
+  for (blk = 0; blk < C_MAX_BLOCKS_IN_MCU; blk++)
+    controller->dummy_buffer[blk] = &dummy_blocks[blk];
+}
diff --git a/media/libjpeg/jdapimin.c b/media/libjpeg/jdapimin.c
new file mode 100644
index 0000000000..30126a048d
--- /dev/null
+++ b/media/libjpeg/jdapimin.c
@@ -0,0 +1,407 @@
+/*
+ * jdapimin.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains application interface code for the decompression half
+ * of the JPEG library. These are the "minimum" API routines that may be
+ * needed in either the normal full-decompression case or the
+ * transcoding-only case.
+ *
+ * Most of the routines intended to be called directly by an application
+ * are in this file or in jdapistd.c. But also see jcomapi.c for routines
+ * shared by compression and decompression, and jdtrans.c for the transcoding
+ * case.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdmaster.h"
+
+
+/*
+ * Initialization of a JPEG decompression object.
+ * The error manager must already be set up (in case memory manager fails).
+ */
+
+GLOBAL(void)
+jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
+{
+  struct jpeg_error_mgr *saved_err;
+  void *saved_client_data;
+  int tbl;
+
+  /* Clear mem first so jpeg_destroy can tell the memory manager was never
+   * set up, then reject callers built against a different library version
+   * or structure layout.
+   */
+  cinfo->mem = NULL;
+  if (version != JPEG_LIB_VERSION) {
+    ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
+  }
+  if (structsize != sizeof(struct jpeg_decompress_struct)) {
+    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
+             (int)sizeof(struct jpeg_decompress_struct), (int)structsize);
+  }
+
+  /* Zero the whole master structure (a debugging aid).  The application has
+   * already stored the err pointer and possibly client_data, so both are
+   * carried across the wipe.  Reading an unset client_data may draw a
+   * complaint from tools such as Purify; that complaint can be ignored.
+   */
+  saved_err = cinfo->err;
+  saved_client_data = cinfo->client_data;
+  memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
+  cinfo->err = saved_err;
+  cinfo->client_data = saved_client_data;
+
+  cinfo->is_decompressor = TRUE;
+
+  /* Give this object its own memory manager instance. */
+  jinit_memory_mgr((j_common_ptr)cinfo);
+
+  /* No permanent structures exist yet. */
+  cinfo->progress = NULL;
+  cinfo->src = NULL;
+
+  for (tbl = 0; tbl < NUM_QUANT_TBLS; tbl++) {
+    cinfo->quant_tbl_ptrs[tbl] = NULL;
+  }
+  for (tbl = 0; tbl < NUM_HUFF_TBLS; tbl++) {
+    cinfo->dc_huff_tbl_ptrs[tbl] = NULL;
+    cinfo->ac_huff_tbl_ptrs[tbl] = NULL;
+  }
+
+  /* The marker processor is created now so that the application can
+   * override the COM/APPn methods before it calls jpeg_read_header.
+   */
+  cinfo->marker_list = NULL;
+  jinit_marker_reader(cinfo);
+
+  /* Overall input controller. */
+  jinit_input_controller(cinfo);
+
+  /* The object is now usable. */
+  cinfo->global_state = DSTATE_START;
+
+  /* Extension parameters are kept in the master struct, which is why it is
+   * allocated (and cleared) here.
+   */
+  cinfo->master = (struct jpeg_decomp_master *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(my_decomp_master));
+  memset(cinfo->master, 0, sizeof(my_decomp_master));
+}
+
+
+/*
+ * Destruction of a JPEG decompression object
+ */
+
+GLOBAL(void)
+jpeg_destroy_decompress(j_decompress_ptr cinfo)
+{
+  /* Destruction is handled by the routine shared with the compressor. */
+  j_common_ptr common = (j_common_ptr)cinfo;
+
+  jpeg_destroy(common);
+}
+
+
+/*
+ * Abort processing of a JPEG decompression operation,
+ * but don't destroy the object itself.
+ */
+
+GLOBAL(void)
+jpeg_abort_decompress(j_decompress_ptr cinfo)
+{
+  /* Aborting is handled by the routine shared with the compressor. */
+  j_common_ptr common = (j_common_ptr)cinfo;
+
+  jpeg_abort(common);
+}
+
+
+/*
+ * Set default decompression parameters.
+ */
+
+LOCAL(void)
+default_decompress_parms(j_decompress_ptr cinfo)
+{
+  const int ncomp = cinfo->num_components;
+
+  /* Guess the colorspace of the file from the component count and whatever
+   * markers were seen, and choose a matching output colorspace.  These are
+   * only guesses; the application may override them.
+   */
+  if (ncomp == 1) {
+    cinfo->jpeg_color_space = JCS_GRAYSCALE;
+    cinfo->out_color_space = JCS_GRAYSCALE;
+  } else if (ncomp == 3) {
+    if (cinfo->saw_JFIF_marker) {
+      /* JFIF implies YCbCr */
+      cinfo->jpeg_color_space = JCS_YCbCr;
+    } else if (cinfo->saw_Adobe_marker) {
+      if (cinfo->Adobe_transform == 0) {
+        cinfo->jpeg_color_space = JCS_RGB;
+      } else {
+        /* Transform 1 is YCbCr; anything else is unexpected, so warn and
+         * assume YCbCr anyway.
+         */
+        if (cinfo->Adobe_transform != 1)
+          WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
+        cinfo->jpeg_color_space = JCS_YCbCr;
+      }
+    } else {
+      /* No special markers: fall back on the component IDs. */
+      int id_first = cinfo->comp_info[0].component_id;
+      int id_second = cinfo->comp_info[1].component_id;
+      int id_third = cinfo->comp_info[2].component_id;
+
+      if (id_first == 82 && id_second == 71 && id_third == 66) {
+        /* ASCII 'R', 'G', 'B' */
+        cinfo->jpeg_color_space = JCS_RGB;
+      } else {
+        /* IDs 1, 2, 3 look like JFIF without the marker; any other
+         * combination is traced and assumed to be YCbCr as well.
+         */
+        if (!(id_first == 1 && id_second == 2 && id_third == 3))
+          TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, id_first, id_second, id_third);
+        cinfo->jpeg_color_space = JCS_YCbCr;
+      }
+    }
+    /* RGB is always the guess for the output colorspace. */
+    cinfo->out_color_space = JCS_RGB;
+  } else if (ncomp == 4) {
+    if (cinfo->saw_Adobe_marker && cinfo->Adobe_transform != 0) {
+      /* Transform 2 is YCCK; anything else is unexpected, so warn and
+       * assume YCCK anyway.
+       */
+      if (cinfo->Adobe_transform != 2)
+        WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
+      cinfo->jpeg_color_space = JCS_YCCK;
+    } else {
+      /* Adobe transform 0, or no Adobe marker at all: straight CMYK. */
+      cinfo->jpeg_color_space = JCS_CMYK;
+    }
+    cinfo->out_color_space = JCS_CMYK;
+  } else {
+    cinfo->jpeg_color_space = JCS_UNKNOWN;
+    cinfo->out_color_space = JCS_UNKNOWN;
+  }
+
+  /* Remaining decompression parameters. */
+  cinfo->scale_num = 1;                 /* 1:1 scaling */
+  cinfo->scale_denom = 1;
+  cinfo->output_gamma = 1.0;
+  cinfo->buffered_image = FALSE;
+  cinfo->raw_data_out = FALSE;
+  cinfo->dct_method = JDCT_DEFAULT;
+  cinfo->do_fancy_upsampling = TRUE;
+  cinfo->do_block_smoothing = TRUE;
+
+  /* Quantization is off, but its parameters are still given defaults in
+   * case the application sets nothing but quantize_colors.
+   */
+  cinfo->quantize_colors = FALSE;
+  cinfo->dither_mode = JDITHER_FS;
+#ifdef QUANT_2PASS_SUPPORTED
+  cinfo->two_pass_quantize = TRUE;
+#else
+  cinfo->two_pass_quantize = FALSE;
+#endif
+  cinfo->desired_number_of_colors = 256;
+  cinfo->colormap = NULL;
+
+  /* No mode change is prepared for buffered-image mode. */
+  cinfo->enable_1pass_quant = FALSE;
+  cinfo->enable_external_quant = FALSE;
+  cinfo->enable_2pass_quant = FALSE;
+}
+
+
+/*
+ * Decompression startup: read start of JPEG datastream to see what's there.
+ * Need only initialize JPEG object and supply a data source before calling.
+ *
+ * This routine will read as far as the first SOS marker (ie, actual start of
+ * compressed data), and will save all tables and parameters in the JPEG
+ * object. It will also initialize the decompression parameters to default
+ * values, and finally return JPEG_HEADER_OK. On return, the application may
+ * adjust the decompression parameters and then call jpeg_start_decompress.
+ * (Or, if the application only wanted to determine the image parameters,
+ * the data need not be decompressed. In that case, call jpeg_abort or
+ * jpeg_destroy to release any temporary space.)
+ * If an abbreviated (tables only) datastream is presented, the routine will
+ * return JPEG_HEADER_TABLES_ONLY upon reaching EOI. The application may then
+ * re-use the JPEG object to read the abbreviated image datastream(s).
+ * It is unnecessary (but OK) to call jpeg_abort in this case.
+ * The JPEG_SUSPENDED return code only occurs if the data source module
+ * requests suspension of the decompressor. In this case the application
+ * should load more source data and then re-call jpeg_read_header to resume
+ * processing.
+ * If a non-suspending data source is used and require_image is TRUE, then the
+ * return code need not be inspected since only JPEG_HEADER_OK is possible.
+ *
+ * This routine is now just a front end to jpeg_consume_input, with some
+ * extra error checking.
+ */
+
+GLOBAL(int)
+jpeg_read_header(j_decompress_ptr cinfo, boolean require_image)
+{
+  int status;
+
+  /* Only legal before the header has been read completely. */
+  if (!(cinfo->global_state == DSTATE_START ||
+        cinfo->global_state == DSTATE_INHEADER)) {
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  status = jpeg_consume_input(cinfo);
+
+  if (status == JPEG_REACHED_SOS) {
+    return JPEG_HEADER_OK;
+  }
+
+  if (status == JPEG_REACHED_EOI) {
+    /* Tables-only datastream: an error if the caller insisted on an image. */
+    if (require_image)
+      ERREXIT(cinfo, JERR_NO_IMAGE);
+    /* Go back to the start state.  Requiring the application to call
+     * jpeg_abort would be safer, but that cannot be changed now for
+     * compatibility reasons.  As a side effect any temporary memory is
+     * freed (there shouldn't be any).
+     */
+    jpeg_abort((j_common_ptr)cinfo);    /* sets state = DSTATE_START */
+    return JPEG_HEADER_TABLES_ONLY;
+  }
+
+  /* JPEG_SUSPENDED (or anything else) is handed back unchanged. */
+  return status;
+}
+
+
+/*
+ * Consume data in advance of what the decompressor requires.
+ * This can be called at any time once the decompressor object has
+ * been created and a data source has been set up.
+ *
+ * This routine is essentially a state machine that handles a couple
+ * of critical state-transition actions, namely initial setup and
+ * transition from header scanning to ready-for-start_decompress.
+ * All the actual input is done via the input controller's consume_input
+ * method.
+ */
+
+GLOBAL(int)
+jpeg_consume_input(j_decompress_ptr cinfo)
+{
+  int result = JPEG_SUSPENDED;
+
+  /* Start-of-datastream actions: reset the input controller, initialize the
+   * application's data source module, then carry on as "in header".
+   */
+  if (cinfo->global_state == DSTATE_START) {
+    (*cinfo->inputctl->reset_input_controller) (cinfo);
+    (*cinfo->src->init_source) (cinfo);
+    cinfo->global_state = DSTATE_INHEADER;
+  }
+
+  /* NB: every DSTATE value that can reach this point should be listed in
+   * this switch (DSTATE_START has been turned into DSTATE_INHEADER above).
+   */
+  switch (cinfo->global_state) {
+  case DSTATE_INHEADER:
+    result = (*cinfo->inputctl->consume_input) (cinfo);
+    if (result == JPEG_REACHED_SOS) {
+      /* Found SOS: derive default parameters from the header data and mark
+       * the object ready for start_decompress.
+       */
+      default_decompress_parms(cinfo);
+      cinfo->global_state = DSTATE_READY;
+    }
+    break;
+
+  case DSTATE_READY:
+    /* The first SOS cannot be passed until start_decompress is called. */
+    result = JPEG_REACHED_SOS;
+    break;
+
+  case DSTATE_PRELOAD:
+  case DSTATE_PRESCAN:
+  case DSTATE_SCANNING:
+  case DSTATE_RAW_OK:
+  case DSTATE_BUFIMAGE:
+  case DSTATE_BUFPOST:
+  case DSTATE_STOPPING:
+    result = (*cinfo->inputctl->consume_input) (cinfo);
+    break;
+
+  default:
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  return result;
+}
+
+
+/*
+ * Have we finished reading the input file?
+ */
+
+GLOBAL(boolean)
+jpeg_input_complete(j_decompress_ptr cinfo)
+{
+  /* The object must be in one of the valid decompressor states. */
+  if (!(cinfo->global_state >= DSTATE_START &&
+        cinfo->global_state <= DSTATE_STOPPING)) {
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  return cinfo->inputctl->eoi_reached;
+}
+
+
+/*
+ * Is there more than one scan?
+ */
+
+GLOBAL(boolean)
+jpeg_has_multiple_scans(j_decompress_ptr cinfo)
+{
+  /* The answer is known only once jpeg_read_header has completed. */
+  if (!(cinfo->global_state >= DSTATE_READY &&
+        cinfo->global_state <= DSTATE_STOPPING)) {
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  return cinfo->inputctl->has_multiple_scans;
+}
+
+
+/*
+ * Finish JPEG decompression.
+ *
+ * This will normally just verify the file trailer and release temp storage.
+ *
+ * Returns FALSE if suspended. The return value need be inspected only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(boolean)
+jpeg_finish_decompress(j_decompress_ptr cinfo)
+{
+  const int state = cinfo->global_state;
+
+  if ((state == DSTATE_SCANNING || state == DSTATE_RAW_OK) &&
+      !cinfo->buffered_image) {
+    /* End of the final pass in non-buffered mode; every output row must
+     * have been delivered by now.
+     */
+    if (cinfo->output_scanline < cinfo->output_height) {
+      ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
+    }
+    (*cinfo->master->finish_output_pass) (cinfo);
+    cinfo->global_state = DSTATE_STOPPING;
+  } else if (state == DSTATE_BUFIMAGE) {
+    /* Finishing after a buffered-image operation. */
+    cinfo->global_state = DSTATE_STOPPING;
+  } else if (state != DSTATE_STOPPING) {
+    /* DSTATE_STOPPING means a repeat call after a suspension; any other
+     * state is an error.
+     */
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  /* Consume input until EOI has been seen, giving up if the source
+   * suspends; the caller is expected to come back later.
+   */
+  for (;;) {
+    if (cinfo->inputctl->eoi_reached)
+      break;
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
+      return FALSE;
+  }
+
+  /* Final cleanup; jpeg_abort releases memory and resets global_state. */
+  (*cinfo->src->term_source) (cinfo);
+  jpeg_abort((j_common_ptr)cinfo);
+  return TRUE;
+}
diff --git a/media/libjpeg/jdapistd.c b/media/libjpeg/jdapistd.c
new file mode 100644
index 0000000000..02cd0cb93a
--- /dev/null
+++ b/media/libjpeg/jdapistd.c
@@ -0,0 +1,689 @@
+/*
+ * jdapistd.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains application interface code for the decompression half
+ * of the JPEG library. These are the "standard" API routines that are
+ * used in the normal full-decompression case. They are not used by a
+ * transcoding-only application. Note that if an application links in
+ * jpeg_start_decompress, it will end up linking in the entire decompressor.
+ * We thus must separate this file from jdapimin.c to avoid linking the
+ * whole decompression library into a transcoder.
+ */
+
+#include "jinclude.h"
+#include "jdmainct.h"
+#include "jdcoefct.h"
+#include "jdmaster.h"
+#include "jdmerge.h"
+#include "jdsample.h"
+#include "jmemsys.h"
+
+/* Forward declarations */
+LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
+
+
+/*
+ * Decompression initialization.
+ * jpeg_read_header must be completed before calling this.
+ *
+ * If a multipass operating mode was selected, this will do all but the
+ * last pass, and thus may take a great deal of time.
+ *
+ * Returns FALSE if suspended. The return value need be inspected only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(boolean)
+jpeg_start_decompress(j_decompress_ptr cinfo)
+{
+ /* Entry states handled here:
+ * DSTATE_READY - first call;
+ * DSTATE_PRELOAD - re-entry after returning FALSE from the scan-absorbing
+ * loop below;
+ * DSTATE_PRESCAN - re-entry after output_pass_setup() returned FALSE.
+ * Any other state raises JERR_BAD_STATE.
+ */
+ if (cinfo->global_state == DSTATE_READY) {
+ /* First call: initialize master control, select active modules */
+ jinit_master_decompress(cinfo);
+ if (cinfo->buffered_image) {
+ /* No more work here; expecting jpeg_start_output next */
+ cinfo->global_state = DSTATE_BUFIMAGE;
+ return TRUE;
+ }
+ cinfo->global_state = DSTATE_PRELOAD;
+ }
+ if (cinfo->global_state == DSTATE_PRELOAD) {
+ /* If file has multiple scans, absorb them all into the coef buffer */
+ if (cinfo->inputctl->has_multiple_scans) {
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+ /* Loop until consume_input() reports EOI; a suspension returns FALSE
+ * with global_state still DSTATE_PRELOAD, so the next call resumes here.
+ */
+ for (;;) {
+ int retcode;
+ /* Call progress monitor hook if present */
+ if (cinfo->progress != NULL)
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
+ /* Absorb some more input */
+ retcode = (*cinfo->inputctl->consume_input) (cinfo);
+ if (retcode == JPEG_SUSPENDED)
+ return FALSE;
+ if (retcode == JPEG_REACHED_EOI)
+ break;
+ /* Advance progress counter if appropriate */
+ if (cinfo->progress != NULL &&
+ (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
+ if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
+ /* jdmaster underestimated number of scans; ratchet up one scan */
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
+ }
+ }
+ }
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
+ }
+ /* Output is produced from whatever scan the input side has reached */
+ cinfo->output_scan_number = cinfo->input_scan_number;
+ } else if (cinfo->global_state != DSTATE_PRESCAN)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+ /* Perform any dummy output passes, and set up for the final pass */
+ return output_pass_setup(cinfo);
+}
+
+
+/*
+ * Set up for an output pass, and perform any dummy pass(es) needed.
+ * Common subroutine for jpeg_start_decompress and jpeg_start_output.
+ * Entry: global_state = DSTATE_PRESCAN only if previously suspended.
+ * Exit: If done, returns TRUE and sets global_state for proper output mode.
+ * If suspended, returns FALSE and sets global_state = DSTATE_PRESCAN.
+ */
+
+LOCAL(boolean)
+output_pass_setup(j_decompress_ptr cinfo)
+{
+ /* DSTATE_PRESCAN on entry means a previous call suspended inside the dummy
+ * pass loop below; in that case the pass is already prepared and
+ * output_scanline must keep its current value.
+ */
+ if (cinfo->global_state != DSTATE_PRESCAN) {
+ /* First call: do pass setup */
+ (*cinfo->master->prepare_for_output_pass) (cinfo);
+ cinfo->output_scanline = 0;
+ cinfo->global_state = DSTATE_PRESCAN;
+ }
+ /* Loop over any required dummy passes */
+ while (cinfo->master->is_dummy_pass) {
+#ifdef QUANT_2PASS_SUPPORTED
+ /* Crank through the dummy pass */
+ while (cinfo->output_scanline < cinfo->output_height) {
+ JDIMENSION last_scanline;
+ /* Call progress monitor hook if present */
+ if (cinfo->progress != NULL) {
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
+ }
+ /* Process some data */
+ /* No output buffer is supplied (NULL array, zero row limit); the call
+ * is handed cinfo->output_scanline itself as its row counter.
+ */
+ last_scanline = cinfo->output_scanline;
+ (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+ &cinfo->output_scanline, (JDIMENSION)0);
+ if (cinfo->output_scanline == last_scanline)
+ return FALSE; /* No progress made, must suspend */
+ }
+ /* Finish up dummy pass, and set up for another one */
+ (*cinfo->master->finish_output_pass) (cinfo);
+ (*cinfo->master->prepare_for_output_pass) (cinfo);
+ cinfo->output_scanline = 0;
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif /* QUANT_2PASS_SUPPORTED */
+ }
+ /* Ready for application to drive output pass through
+ * jpeg_read_scanlines or jpeg_read_raw_data.
+ */
+ cinfo->global_state = cinfo->raw_data_out ? DSTATE_RAW_OK : DSTATE_SCANNING;
+ return TRUE;
+}
+
+
+/*
+ * Enable partial scanline decompression
+ *
+ * Must be called after jpeg_start_decompress() and before any calls to
+ * jpeg_read_scanlines() or jpeg_skip_scanlines().
+ *
+ * xoffset and width are in/out parameters.  On entry they describe the
+ * requested horizontal region in output pixels.  On return, *xoffset has been
+ * rounded down to an iMCU column boundary and *width has been enlarged by the
+ * same amount, so that the right edge of the region is unchanged.
+ * cinfo->output_width is set to the returned *width.
+ *
+ * Errors raised through the error manager:
+ *   JERR_BAD_STATE       wrong global_state, or scanlines were already read
+ *   JERR_BAD_CROP_SPEC   xoffset or width is a NULL pointer
+ *   JERR_WIDTH_OVERFLOW  the region is empty or extends past output_width
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(void)
+jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                   JDIMENSION *width)
+{
+  int ci, align, orig_downsampled_width;
+  JDIMENSION input_xoffset;
+  boolean reinit_upsampler = FALSE;
+  jpeg_component_info *compptr;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+#endif
+
+  if ((cinfo->global_state != DSTATE_SCANNING &&
+       cinfo->global_state != DSTATE_BUFIMAGE) || cinfo->output_scanline != 0)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  if (!xoffset || !width)
+    ERREXIT(cinfo, JERR_BAD_CROP_SPEC);
+
+  /* xoffset and width must fall within the output image dimensions.
+   * JDIMENSION is an unsigned type, so the sum *xoffset + *width could wrap
+   * around and make an out-of-range request look valid.  Compare against the
+   * remaining width instead, which cannot wrap once *xoffset is known to be
+   * <= output_width.
+   */
+  if (*width == 0 || *xoffset > cinfo->output_width ||
+      *width > cinfo->output_width - *xoffset)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+
+  /* No need to do anything if the caller wants the entire width. */
+  if (*width == cinfo->output_width)
+    return;
+
+  /* Ensuring the proper alignment of xoffset is tricky.  At minimum, it
+   * must align with an MCU boundary, because:
+   *
+   *   (1) The IDCT is performed in blocks, and it is not feasible to modify
+   *       the algorithm so that it can transform partial blocks.
+   *   (2) Because of the SIMD extensions, any input buffer passed to the
+   *       upsampling and color conversion routines must be aligned to the
+   *       SIMD word size (for instance, 128-bit in the case of SSE2.)  The
+   *       easiest way to accomplish this without copying data is to ensure
+   *       that upsampling and color conversion begin at the start of the
+   *       first MCU column that will be inverse transformed.
+   *
+   * In practice, we actually impose a stricter alignment requirement.  We
+   * require that xoffset be a multiple of the maximum MCU column width of all
+   * of the components (the "iMCU column width.")  This is to simplify the
+   * single-pass decompression case, allowing us to use the same MCU column
+   * width for all of the components.
+   */
+  if (cinfo->comps_in_scan == 1 && cinfo->num_components == 1)
+    align = cinfo->_min_DCT_scaled_size;
+  else
+    align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+
+  /* Adjust xoffset to the nearest iMCU boundary <= the requested value */
+  input_xoffset = *xoffset;
+  *xoffset = (input_xoffset / align) * align;
+
+  /* Adjust the width so that the right edge of the output image is as
+   * requested (only the left edge is altered.)  It is important that calling
+   * programs check this value after this function returns, so that they can
+   * allocate an output buffer with the appropriate size.
+   */
+  *width = *width + input_xoffset - *xoffset;
+  cinfo->output_width = *width;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+    /* Keep the merged upsampler's row width in step with the new
+     * output_width.
+     */
+    my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+    upsample->out_row_width =
+      cinfo->output_width * cinfo->out_color_components;
+  }
+#endif
+
+  /* Set the first and last iMCU columns that we must decompress.  These values
+   * will be used in single-scan decompressions.
+   */
+  cinfo->master->first_iMCU_col = (JDIMENSION)(long)(*xoffset) / (long)align;
+  cinfo->master->last_iMCU_col =
+    (JDIMENSION)jdiv_round_up((long)(*xoffset + cinfo->output_width),
+                              (long)align) - 1;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    int hsf = (cinfo->comps_in_scan == 1 && cinfo->num_components == 1) ?
+              1 : compptr->h_samp_factor;
+
+    /* Set downsampled_width to the new output width. */
+    orig_downsampled_width = compptr->downsampled_width;
+    compptr->downsampled_width =
+      (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
+                                       compptr->h_samp_factor),
+                                (long)cinfo->max_h_samp_factor);
+    /* The upsampler is re-selected below only when a component's width
+     * drops from >= 2 to < 2.
+     */
+    if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
+      reinit_upsampler = TRUE;
+
+    /* Set the first and last MCU columns of this component that we must
+     * decompress.  These values will be used in multi-scan decompressions.
+     */
+    cinfo->master->first_MCU_col[ci] =
+      (JDIMENSION)(long)(*xoffset * hsf) / (long)align;
+    cinfo->master->last_MCU_col[ci] =
+      (JDIMENSION)jdiv_round_up((long)((*xoffset + cinfo->output_width) * hsf),
+                                (long)align) - 1;
+  }
+
+  if (reinit_upsampler) {
+    /* Re-run upsampler initialization with the no-alloc flag set for the
+     * duration of the call.
+     */
+    cinfo->master->jinit_upsampler_no_alloc = TRUE;
+    jinit_upsampler(cinfo);
+    cinfo->master->jinit_upsampler_no_alloc = FALSE;
+  }
+}
+
+
+/*
+ * Read some scanlines of data from the JPEG decompressor.
+ *
+ * The return value will be the number of lines actually read.
+ * This may be less than the number requested in several cases,
+ * including bottom of image, data source suspension, and operating
+ * modes that emit multiple scanlines at a time.
+ *
+ * Note: we warn about excess calls to jpeg_read_scanlines() since
+ * this likely signals an application programmer error. However,
+ * an oversize buffer (max_lines > scanlines remaining) is not an error.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                    JDIMENSION max_lines)
+{
+  JDIMENSION lines_read = 0;
+
+  if (cinfo->global_state != DSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  if (cinfo->output_scanline < cinfo->output_height) {
+    /* Report current position to the progress monitor, if one is installed */
+    if (cinfo->progress != NULL) {
+      cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+      cinfo->progress->pass_limit = (long)cinfo->output_height;
+      (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
+    }
+
+    /* Let the main controller fill up to max_lines rows; it counts the rows
+     * it delivered in lines_read.
+     */
+    (*cinfo->main->process_data) (cinfo, scanlines, &lines_read, max_lines);
+    cinfo->output_scanline += lines_read;
+    return lines_read;
+  }
+
+  /* Called again after the last scanline was already delivered */
+  WARNMS(cinfo, JWRN_TOO_MUCH_DATA);
+  return 0;
+}
+
+
+/* Dummy color convert function used by jpeg_skip_scanlines() */
+LOCAL(void)
+noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+ /* Intentionally empty: read_and_discard_scanlines() temporarily installs
+ * this in place of cinfo->cconvert->color_convert, so all arguments are
+ * ignored and nothing is written to output_buf.
+ */
+}
+
+
+/* Dummy quantize function used by jpeg_skip_scanlines() */
+LOCAL(void)
+noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
+{
+ /* Intentionally empty: read_and_discard_scanlines() temporarily installs
+ * this in place of cinfo->cquantize->color_quantize, so all arguments are
+ * ignored and nothing is written to output_buf.
+ */
+}
+
+
+/*
+ * In some cases, it is best to call jpeg_read_scanlines() and discard the
+ * output, rather than skipping the scanlines, because this allows us to
+ * maintain the internal state of the context-based upsampler. In these cases,
+ * we set up and tear down a dummy color converter in order to avoid valgrind
+ * errors and to achieve the best possible performance.
+ */
+
+LOCAL(void)
+read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
+{
+ /* Reads num_lines scanlines one at a time through jpeg_read_scanlines()
+ * with the color-convert and color-quantize hooks replaced by no-ops, then
+ * restores the original hooks.
+ */
+ JDIMENSION n;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+#endif
+ JSAMPLE dummy_sample[1] = { 0 };
+ JSAMPROW dummy_row = dummy_sample;
+ JSAMPARRAY scanlines = NULL;
+ /* Saved hook pointers; NULL means "hook was not replaced" */
+ void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows) = NULL;
+ void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows) = NULL;
+
+ if (cinfo->cconvert && cinfo->cconvert->color_convert) {
+ color_convert = cinfo->cconvert->color_convert;
+ cinfo->cconvert->color_convert = noop_convert;
+ /* This just prevents UBSan from complaining about adding 0 to a NULL
+ * pointer. The pointer isn't actually used.
+ */
+ scanlines = &dummy_row;
+ }
+
+ if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
+ color_quantize = cinfo->cquantize->color_quantize;
+ cinfo->cquantize->color_quantize = noop_quantize;
+ }
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+ /* With merged upsampling and max_v_samp_factor == 2, the discarded rows are
+ * directed at the upsampler's own spare_row instead of the dummy row.
+ */
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ scanlines = &upsample->spare_row;
+ }
+#endif
+
+ /* NOTE(review): the return value of jpeg_read_scanlines() is ignored, so a
+ * call that delivers 0 lines is still counted as one discarded line here.
+ */
+ for (n = 0; n < num_lines; n++)
+ jpeg_read_scanlines(cinfo, scanlines, 1);
+
+ /* Put back whichever hooks were replaced above */
+ if (color_convert)
+ cinfo->cconvert->color_convert = color_convert;
+
+ if (color_quantize)
+ cinfo->cquantize->color_quantize = color_quantize;
+}
+
+
+/*
+ * Called by jpeg_skip_scanlines(). This partially skips a decompress block by
+ * incrementing the rowgroup counter.
+ */
+
+LOCAL(void)
+increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+  JDIMENSION whole_groups, leftover_rows;
+
+  if (!master->using_merged_upsample || cinfo->max_v_samp_factor != 2) {
+    /* Split the request into complete row groups, which are skipped by
+     * bumping the row group counter, and a remainder.
+     */
+    whole_groups = rows / cinfo->max_v_samp_factor;
+    leftover_rows = rows % cinfo->max_v_samp_factor;
+
+    main_ptr->rowgroup_ctr += whole_groups;
+    cinfo->output_scanline += rows - leftover_rows;
+
+    /* A partial row group cannot be skipped without disturbing the
+     * upsampler's internal state, so those rows are decoded and thrown away.
+     */
+    read_and_discard_scanlines(cinfo, leftover_rows);
+  } else {
+    /* Merged upsampling with 2 rows per group: read and discard every row */
+    read_and_discard_scanlines(cinfo, rows);
+  }
+}
+
+/*
+ * Skips some scanlines of data from the JPEG decompressor.
+ *
+ * The return value will be the number of lines actually skipped.  If skipping
+ * num_lines would move beyond the end of the image, then the actual number of
+ * lines remaining in the image is returned.  Otherwise, the return value will
+ * be equal to num_lines.
+ *
+ * Errors raised through the error manager:
+ *   JERR_NOTIMPL    two-pass color quantization is enabled
+ *   JERR_BAD_STATE  global_state is not DSTATE_SCANNING
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+  JDIMENSION i, x;
+  int y;
+  JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
+  JDIMENSION lines_to_skip, lines_to_read;
+
+  /* Two-pass color quantization is not supported. */
+  if (cinfo->quantize_colors && cinfo->two_pass_quantize)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
+  if (cinfo->global_state != DSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Do not skip past the bottom of the image.  JDIMENSION is an unsigned
+   * type, so output_scanline + num_lines could wrap around for a very large
+   * num_lines and slip past this clamp.  Compare num_lines against the number
+   * of lines remaining instead, which cannot wrap.
+   */
+  if (cinfo->output_scanline >= cinfo->output_height ||
+      num_lines >= cinfo->output_height - cinfo->output_scanline) {
+    num_lines = cinfo->output_height - cinfo->output_scanline;
+    cinfo->output_scanline = cinfo->output_height;
+    (*cinfo->inputctl->finish_input_pass) (cinfo);
+    cinfo->inputctl->eoi_reached = TRUE;
+    return num_lines;
+  }
+
+  if (num_lines == 0)
+    return 0;
+
+  lines_per_iMCU_row = cinfo->_min_DCT_scaled_size * cinfo->max_v_samp_factor;
+  lines_left_in_iMCU_row =
+    (lines_per_iMCU_row - (cinfo->output_scanline % lines_per_iMCU_row)) %
+    lines_per_iMCU_row;
+  /* NOTE: this wraps when num_lines < lines_left_in_iMCU_row; both branches
+   * below return early in that case, before the value is used.
+   */
+  lines_after_iMCU_row = num_lines - lines_left_in_iMCU_row;
+
+  /* Skip the lines remaining in the current iMCU row.  When upsampling
+   * requires context rows, we need the previous and next rows in order to read
+   * the current row.  This adds some complexity.
+   */
+  if (cinfo->upsample->need_context_rows) {
+    /* If the skipped lines would not move us past the current iMCU row, we
+     * read the lines and ignore them.  There might be a faster way of doing
+     * this, but we are facing increasing complexity for diminishing returns.
+     * The increasing complexity would be a by-product of meddling with the
+     * state machine used to skip context rows.  Near the end of an iMCU row,
+     * the next iMCU row may have already been entropy-decoded.  In this unique
+     * case, we will read the next iMCU row if we cannot skip past it as well.
+     */
+    if ((num_lines < lines_left_in_iMCU_row + 1) ||
+        (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full &&
+         lines_after_iMCU_row < lines_per_iMCU_row + 1)) {
+      read_and_discard_scanlines(cinfo, num_lines);
+      return num_lines;
+    }
+
+    /* If the next iMCU row has already been entropy-decoded, make sure that
+     * we do not skip too far.
+     */
+    if (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full) {
+      cinfo->output_scanline += lines_left_in_iMCU_row + lines_per_iMCU_row;
+      lines_after_iMCU_row -= lines_per_iMCU_row;
+    } else {
+      cinfo->output_scanline += lines_left_in_iMCU_row;
+    }
+
+    /* If we have just completed the first block, adjust the buffer pointers */
+    if (main_ptr->iMCU_row_ctr == 0 ||
+        (main_ptr->iMCU_row_ctr == 1 && lines_left_in_iMCU_row > 2))
+      set_wraparound_pointers(cinfo);
+    main_ptr->buffer_full = FALSE;
+    main_ptr->rowgroup_ctr = 0;
+    main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+    if (!master->using_merged_upsample) {
+      upsample->next_row_out = cinfo->max_v_samp_factor;
+      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    }
+  }
+
+  /* Skipping is much simpler when context rows are not required. */
+  else {
+    if (num_lines < lines_left_in_iMCU_row) {
+      increment_simple_rowgroup_ctr(cinfo, num_lines);
+      return num_lines;
+    } else {
+      cinfo->output_scanline += lines_left_in_iMCU_row;
+      main_ptr->buffer_full = FALSE;
+      main_ptr->rowgroup_ctr = 0;
+      if (!master->using_merged_upsample) {
+        upsample->next_row_out = cinfo->max_v_samp_factor;
+        upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+      }
+    }
+  }
+
+  /* Calculate how many full iMCU rows we can skip. */
+  if (cinfo->upsample->need_context_rows)
+    lines_to_skip = ((lines_after_iMCU_row - 1) / lines_per_iMCU_row) *
+                    lines_per_iMCU_row;
+  else
+    lines_to_skip = (lines_after_iMCU_row / lines_per_iMCU_row) *
+                    lines_per_iMCU_row;
+  /* Calculate the number of lines that remain to be skipped after skipping all
+   * of the full iMCU rows that we can.  We will not read these lines unless we
+   * have to.
+   */
+  lines_to_read = lines_after_iMCU_row - lines_to_skip;
+
+  /* For images requiring multiple scans (progressive, non-interleaved, etc.),
+   * all of the entropy decoding occurs in jpeg_start_decompress(), assuming
+   * that the input data source is non-suspending.  This makes skipping easy.
+   */
+  if (cinfo->inputctl->has_multiple_scans || cinfo->buffered_image) {
+    if (cinfo->upsample->need_context_rows) {
+      cinfo->output_scanline += lines_to_skip;
+      cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+      main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
+      /* It is complex to properly move to the middle of a context block, so
+       * read the remaining lines instead of skipping them.
+       */
+      read_and_discard_scanlines(cinfo, lines_to_read);
+    } else {
+      cinfo->output_scanline += lines_to_skip;
+      cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
+      increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+    }
+    if (!master->using_merged_upsample)
+      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    return num_lines;
+  }
+
+  /* Skip the iMCU rows that we can safely skip. */
+  for (i = 0; i < lines_to_skip; i += lines_per_iMCU_row) {
+    for (y = 0; y < coef->MCU_rows_per_iMCU_row; y++) {
+      for (x = 0; x < cinfo->MCUs_per_row; x++) {
+        /* Calling decode_mcu() with a NULL pointer causes it to discard the
+         * decoded coefficients.  This is ~5% faster for large subsets, but
+         * it's tough to tell a difference for smaller images.
+         */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+        (*cinfo->entropy->decode_mcu) (cinfo, NULL);
+      }
+    }
+    cinfo->input_iMCU_row++;
+    cinfo->output_iMCU_row++;
+    if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows)
+      start_iMCU_row(cinfo);
+    else
+      (*cinfo->inputctl->finish_input_pass) (cinfo);
+  }
+  cinfo->output_scanline += lines_to_skip;
+
+  if (cinfo->upsample->need_context_rows) {
+    /* Context-based upsampling keeps track of iMCU rows. */
+    main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
+
+    /* It is complex to properly move to the middle of a context block, so
+     * read the remaining lines instead of skipping them.
+     */
+    read_and_discard_scanlines(cinfo, lines_to_read);
+  } else {
+    increment_simple_rowgroup_ctr(cinfo, lines_to_read);
+  }
+
+  /* Since skipping lines involves skipping the upsampling step, the value of
+   * "rows_to_go" will become invalid unless we set it here.  NOTE: This is a
+   * bit odd, since "rows_to_go" seems to be redundantly keeping track of
+   * output_scanline.
+   */
+  if (!master->using_merged_upsample)
+    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+
+  /* Always skip the requested number of lines. */
+  return num_lines;
+}
+
+/*
+ * Alternate entry point to read raw data.
+ * Processes exactly one iMCU row per call, unless suspended.
+ */
+
+GLOBAL(JDIMENSION)
+jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                   JDIMENSION max_lines)
+{
+  JDIMENSION rows_in_iMCU;
+
+  if (cinfo->global_state != DSTATE_RAW_OK)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Nothing left to deliver: warn and report zero lines */
+  if (cinfo->output_scanline >= cinfo->output_height) {
+    WARNMS(cinfo, JWRN_TOO_MUCH_DATA);
+    return 0;
+  }
+
+  /* Report current position to the progress monitor, if one is installed */
+  if (cinfo->progress != NULL) {
+    cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->output_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
+  }
+
+  /* The caller's buffer must have room for one complete iMCU row */
+  rows_in_iMCU = cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size;
+  if (max_lines < rows_in_iMCU)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* Decode straight into the caller's buffer; a FALSE result means the
+   * data source forced a suspension and no rows were produced.
+   */
+  if ((*cinfo->coef->decompress_data) (cinfo, data)) {
+    cinfo->output_scanline += rows_in_iMCU;
+    return rows_in_iMCU;
+  }
+  return 0;
+}
+
+
+/* Additional entry points for buffered-image mode. */
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+
+/*
+ * Initialize for an output pass in buffered-image mode.
+ */
+
+GLOBAL(boolean)
+jpeg_start_output(j_decompress_ptr cinfo, int scan_number)
+{
+  int target_scan;
+
+  /* Legal only between output passes (DSTATE_BUFIMAGE) or when resuming a
+   * suspended pass setup (DSTATE_PRESCAN).
+   */
+  if (!(cinfo->global_state == DSTATE_BUFIMAGE ||
+        cinfo->global_state == DSTATE_PRESCAN))
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Clamp the requested scan number: at least 1, and once EOI has been seen,
+   * no greater than the last scan that was read.
+   */
+  target_scan = (scan_number > 0) ? scan_number : 1;
+  if (cinfo->inputctl->eoi_reached && target_scan > cinfo->input_scan_number)
+    target_scan = cinfo->input_scan_number;
+  cinfo->output_scan_number = target_scan;
+
+  /* Run any dummy output passes and prepare the real one */
+  return output_pass_setup(cinfo);
+}
+
+
+/*
+ * Finish up after an output pass in buffered-image mode.
+ *
+ * Returns FALSE if suspended. The return value need be inspected only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(boolean)
+jpeg_finish_output(j_decompress_ptr cinfo)
+{
+  /* TRUE while an output pass (scanline or raw-data flavor) is open */
+  boolean in_output_pass = (cinfo->global_state == DSTATE_SCANNING ||
+                            cinfo->global_state == DSTATE_RAW_OK);
+
+  if (in_output_pass && cinfo->buffered_image) {
+    /* Close the pass; it need not have been read to completion */
+    (*cinfo->master->finish_output_pass) (cinfo);
+    cinfo->global_state = DSTATE_BUFPOST;
+  } else if (cinfo->global_state != DSTATE_BUFPOST) {
+    /* DSTATE_BUFPOST means we are being re-entered after a suspension;
+     * every other state is a caller error.
+     */
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  }
+
+  /* Consume input until the input side has moved past the scan just
+   * displayed, or EOI has been reached.
+   */
+  for (;;) {
+    if (cinfo->input_scan_number > cinfo->output_scan_number)
+      break;
+    if (cinfo->inputctl->eoi_reached)
+      break;
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
+      return FALSE;             /* Out of data for now; caller retries */
+  }
+
+  cinfo->global_state = DSTATE_BUFIMAGE;
+  return TRUE;
+}
+
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
diff --git a/media/libjpeg/jdarith.c b/media/libjpeg/jdarith.c
new file mode 100644
index 0000000000..21575e80c7
--- /dev/null
+++ b/media/libjpeg/jdarith.c
@@ -0,0 +1,782 @@
+/*
+ * jdarith.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Developed 1997-2015 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015-2020, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+#define NEG_1 ((unsigned int)-1)
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+ struct jpeg_entropy_decoder pub; /* public fields */
+
+ /* Arithmetic decoder registers; process_restart() resets them to
+ * c = 0, a = 0, ct = -16.
+ */
+ JLONG c; /* C register, base of coding interval + input bit buffer */
+ JLONG a; /* A register, normalized size of coding interval */
+ int ct; /* bit shift counter, # of bits left in bit buffer part of C */
+ /* init: ct = -16 */
+ /* run: ct = 0..7 */
+ /* error: ct = -1 */
+ /* Per-component DC state, indexed by position within the current scan */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+ int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+
+ /* Pointers to statistics areas (these workspaces have image lifespan) */
+ /* Indexed by the component's dc_tbl_no / ac_tbl_no respectively */
+ unsigned char *dc_stats[NUM_ARITH_TBLS];
+ unsigned char *ac_stats[NUM_ARITH_TBLS];
+
+ /* Statistics bin for coding with fixed probability 0.5 */
+ unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder *arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte(j_decompress_ptr cinfo)
+/* Fetch the next byte from the data source.  Suspension is not supported in
+ * this module, so a failed buffer refill is reported as JERR_CANT_SUSPEND.
+ */
+{
+  struct jpeg_source_mgr *src = cinfo->src;
+  int value;
+
+  /* Refill the input buffer when it has run dry */
+  if (src->bytes_in_buffer == 0 && !(*src->fill_input_buffer) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  src->bytes_in_buffer--;
+  value = *src->next_input_byte;
+  src->next_input_byte++;
+  return value;
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can do away with any renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode(j_decompress_ptr cinfo, unsigned char *st)
+{
+ /* st points at the one-byte statistics bin for this decision; it is read
+ * at the start and may be rewritten with the next estimator state.
+ */
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
+ register unsigned char nl, nm;
+ register JLONG qe, temp;
+ register int sv, data;
+
+ /* Renormalization & data input per section D.2.6 */
+ while (e->a < 0x8000L) {
+ if (--e->ct < 0) {
+ /* Need to fetch next data byte */
+ if (cinfo->unread_marker)
+ data = 0; /* stuff zero data */
+ else {
+ data = get_byte(cinfo); /* read next input byte */
+ if (data == 0xFF) { /* zero stuff or marker code */
+ do data = get_byte(cinfo);
+ while (data == 0xFF); /* swallow extra 0xFF bytes */
+ if (data == 0)
+ data = 0xFF; /* discard stuffed zero byte */
+ else {
+ /* Note: Different from the Huffman decoder, hitting
+ * a marker while processing the compressed data
+ * segment is legal in arithmetic coding.
+ * The convention is to supply zero data
+ * then until decoding is complete.
+ */
+ cinfo->unread_marker = data;
+ data = 0;
+ }
+ }
+ }
+ e->c = (e->c << 8) | data; /* insert data into C register */
+ /* ct starts at -16 after a reset, so this branch is taken while the
+ * first two bytes are being loaded.
+ */
+ if ((e->ct += 8) < 0) /* update bit shift counter */
+ /* Need more initial bytes */
+ if (++e->ct == 0)
+ /* Got 2 initial bytes -> re-init A and exit loop */
+ e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+ }
+ e->a <<= 1;
+ }
+
+ /* Fetch values from our compact representation of Table D.2:
+ * Qe values and probability estimation state machine
+ */
+ /* sv is the whole state byte: bit 7 carries the current MPS value and
+ * bits 0..6 select the jpeg_aritab entry. Each entry is unpacked as
+ * low byte = nl, next byte = nm, remaining high bits = Qe.
+ */
+ sv = *st;
+ qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+
+ /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+ temp = e->a - qe;
+ e->a = temp;
+ /* Shift the interval size up by ct bits so it can be compared directly
+ * against C, whose low ct bits are still-unused input.
+ */
+ temp <<= e->ct;
+ if (e->c >= temp) {
+ e->c -= temp;
+ /* Conditional LPS (less probable symbol) exchange */
+ if (e->a < qe) {
+ e->a = qe;
+ *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */
+ } else {
+ e->a = qe;
+ *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */
+ sv ^= 0x80; /* Exchange LPS/MPS */
+ }
+ } else if (e->a < 0x8000L) {
+ /* Conditional MPS (more probable symbol) exchange */
+ if (e->a < qe) {
+ *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */
+ sv ^= 0x80; /* Exchange LPS/MPS */
+ } else {
+ *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */
+ }
+ }
+
+ /* Bit 7 of sv (flipped above on the LPS paths) is the decoded decision */
+ return sv >> 7;
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart(j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+  jpeg_component_info *comp;
+  boolean reset_dc, reset_ac;
+  int idx;
+
+  /* Consume the RSTn marker; suspension is not supported here */
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Decide which statistics areas this kind of scan uses: DC statistics for
+   * sequential scans and for the first DC scan of a progressive image, AC
+   * statistics for sequential scans and for progressive scans with Ss != 0.
+   */
+  reset_dc = !cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0);
+  reset_ac = !cinfo->progressive_mode || cinfo->Ss;
+
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    comp = cinfo->cur_comp_info[idx];
+    if (reset_dc) {
+      memset(entropy->dc_stats[comp->dc_tbl_no], 0, DC_STAT_BINS);
+      /* DC prediction and conditioning context start over as well */
+      entropy->last_dc_val[idx] = 0;
+      entropy->dc_context[idx] = 0;
+    }
+    if (reset_ac)
+      memset(entropy->ac_stats[comp->ac_tbl_no], 0, AC_STAT_BINS);
+  }
+
+  /* Put the arithmetic decoder registers back in their initial state;
+   * ct = -16 makes the decoder load 2 bytes into C before decoding.
+   */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;
+
+  /* Begin a new restart interval */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ unsigned char *st;
+ int blkn, ci, tbl, sign;
+ int v, m;
+
+ /* Process restart marker if needed (counter is decremented on each call) */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ process_restart(cinfo);
+ entropy->restarts_to_go--;
+ }
+
+ if (entropy->ct == -1) return TRUE; /* ct == -1 flags an earlier bad code: decode nothing */
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn];
+ tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+ /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+ /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+ st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+ /* Figure F.19: Decode_DC_DIFF (decision 0 means a zero difference) */
+ if (arith_decode(cinfo, st) == 0)
+ entropy->dc_context[ci] = 0;
+ else {
+ /* Figure F.21: Decoding nonzero value v */
+ /* Figure F.22: Decoding the sign of v (1 = negative, see below) */
+ sign = arith_decode(cinfo, st + 1);
+ st += 2; st += sign;
+ /* Figure F.23: Decoding the magnitude category of v */
+ if ((m = arith_decode(cinfo, st)) != 0) {
+ st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+ while (arith_decode(cinfo, st)) {
+ if ((m <<= 1) == 0x8000) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* magnitude overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ st += 1;
+ }
+ }
+ /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+ else
+ entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
+ v = m;
+ /* Figure F.24: Decoding the magnitude bit pattern of v (bin is 14 past the last category bin) */
+ st += 14;
+ while (m >>= 1)
+ if (arith_decode(cinfo, st)) v |= m;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
+ }
+
+ /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
+ (*block)[0] = (JCOEF)LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ unsigned char *st;
+ int tbl, sign, k;
+ int v, m;
+
+ /* Process restart marker if needed (counter is decremented on each call) */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ process_restart(cinfo);
+ entropy->restarts_to_go--;
+ }
+
+ if (entropy->ct == -1) return TRUE; /* ct == -1 flags an earlier bad code: decode nothing */
+
+ /* There is always only one block per MCU */
+ block = MCU_data[0];
+ tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+ /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+ /* Figure F.20: Decode_AC_coefficients */
+ for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* 3 bins per coefficient index */
+ if (arith_decode(cinfo, st)) break; /* EOB flag */
+ while (arith_decode(cinfo, st + 1) == 0) { /* decision 0: coefficient k is zero, advance */
+ st += 3; k++;
+ if (k > cinfo->Se) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* spectral overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ }
+ /* Figure F.21: Decoding nonzero value v */
+ /* Figure F.22: Decoding the sign of v (uses the fixed-probability bin) */
+ sign = arith_decode(cinfo, entropy->fixed_bin);
+ st += 2;
+ /* Figure F.23: Decoding the magnitude category of v */
+ if ((m = arith_decode(cinfo, st)) != 0) {
+ if (arith_decode(cinfo, st)) {
+ m <<= 1;
+ st = entropy->ac_stats[tbl] +
+ (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+ while (arith_decode(cinfo, st)) {
+ if ((m <<= 1) == 0x8000) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* magnitude overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ st += 1;
+ }
+ }
+ }
+ v = m;
+ /* Figure F.24: Decoding the magnitude bit pattern of v (bin is 14 past the last category bin) */
+ st += 14;
+ while (m >>= 1)
+ if (arith_decode(cinfo, st)) v |= m;
+ v += 1; if (sign) v = -v;
+ /* Scale (shift left by Al) and output coefficient in natural (dezigzagged) order */
+ (*block)[jpeg_natural_order[k]] = (JCOEF)((unsigned)v << cinfo->Al);
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr state = (arith_entropy_ptr)cinfo->entropy;
+  unsigned char *fixed = state->fixed_bin;  /* fixed probability estimation */
+  int bit_mask, b;
+
+  /* Handle a pending restart marker before touching the bitstream */
+  if (cinfo->restart_interval != 0) {
+    if (state->restarts_to_go == 0)
+      process_restart(cinfo);
+    state->restarts_to_go -= 1;
+  }
+
+  /* Mask with a 1 in the bit position refined by this scan */
+  bit_mask = 1 << cinfo->Al;
+
+  /* Each block contributes exactly one decision: the next bit of its
+   * two's-complement DC value.  Set that bit when the decision is 1. */
+  b = 0;
+  while (b < cinfo->blocks_in_MCU) {
+    if (arith_decode(cinfo, fixed) != 0)
+      MCU_data[b][0][0] |= bit_mask;
+    b++;
+  }
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ JBLOCKROW block;
+ JCOEFPTR thiscoef;
+ unsigned char *st;
+ int tbl, k, kex;
+ int p1, m1;
+
+ /* Process restart marker if needed (counter is decremented on each call) */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ process_restart(cinfo);
+ entropy->restarts_to_go--;
+ }
+
+ if (entropy->ct == -1) return TRUE; /* ct == -1 flags an earlier bad code: decode nothing */
+
+ /* There is always only one block per MCU */
+ block = MCU_data[0];
+ tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+ p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
+ m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
+
+ /* EOBx (previous stage EOB) = highest k <= Se already nonzero, else 0 */
+ for (kex = cinfo->Se; kex > 0; kex--)
+ if ((*block)[jpeg_natural_order[kex]]) break;
+
+ for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* 3 bins per coefficient index */
+ if (k > kex) /* the EOB decision is only decoded past EOBx */
+ if (arith_decode(cinfo, st)) break; /* EOB flag */
+ for (;;) {
+ thiscoef = *block + jpeg_natural_order[k];
+ if (*thiscoef) { /* previously nonzero coef */
+ if (arith_decode(cinfo, st + 2)) { /* correction bit: grow magnitude by one unit */
+ if (*thiscoef < 0)
+ *thiscoef += (JCOEF)m1;
+ else
+ *thiscoef += (JCOEF)p1;
+ }
+ break;
+ }
+ if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */
+ if (arith_decode(cinfo, entropy->fixed_bin)) /* sign decision: 1 = negative */
+ *thiscoef = (JCOEF)m1;
+ else
+ *thiscoef = (JCOEF)p1;
+ break;
+ }
+ st += 3; k++;
+ if (k > cinfo->Se) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* spectral overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ }
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ jpeg_component_info *compptr;
+ JBLOCKROW block;
+ unsigned char *st;
+ int blkn, ci, tbl, sign, k;
+ int v, m;
+
+ /* Process restart marker if needed (counter is decremented on each call) */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ process_restart(cinfo);
+ entropy->restarts_to_go--;
+ }
+
+ if (entropy->ct == -1) return TRUE; /* ct == -1 flags an earlier bad code: decode nothing */
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data ? MCU_data[blkn] : NULL; /* NULL MCU_data: decode but store nothing */
+ ci = cinfo->MCU_membership[blkn];
+ compptr = cinfo->cur_comp_info[ci];
+
+ /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+ tbl = compptr->dc_tbl_no;
+
+ /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+ st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+ /* Figure F.19: Decode_DC_DIFF (decision 0 means a zero difference) */
+ if (arith_decode(cinfo, st) == 0)
+ entropy->dc_context[ci] = 0;
+ else {
+ /* Figure F.21: Decoding nonzero value v */
+ /* Figure F.22: Decoding the sign of v (1 = negative, see below) */
+ sign = arith_decode(cinfo, st + 1);
+ st += 2; st += sign;
+ /* Figure F.23: Decoding the magnitude category of v */
+ if ((m = arith_decode(cinfo, st)) != 0) {
+ st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+ while (arith_decode(cinfo, st)) {
+ if ((m <<= 1) == 0x8000) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* magnitude overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ st += 1;
+ }
+ }
+ /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ entropy->dc_context[ci] = 0; /* zero diff category */
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+ else
+ entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
+ v = m;
+ /* Figure F.24: Decoding the magnitude bit pattern of v (bin is 14 past the last category bin) */
+ st += 14;
+ while (m >>= 1)
+ if (arith_decode(cinfo, st)) v |= m;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
+ }
+
+ if (block)
+ (*block)[0] = (JCOEF)entropy->last_dc_val[ci];
+
+ /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+ tbl = compptr->ac_tbl_no;
+
+ /* Figure F.20: Decode_AC_coefficients */
+ for (k = 1; k <= DCTSIZE2 - 1; k++) {
+ st = entropy->ac_stats[tbl] + 3 * (k - 1); /* 3 bins per coefficient index */
+ if (arith_decode(cinfo, st)) break; /* EOB flag */
+ while (arith_decode(cinfo, st + 1) == 0) { /* decision 0: coefficient k is zero, advance */
+ st += 3; k++;
+ if (k > DCTSIZE2 - 1) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* spectral overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ }
+ /* Figure F.21: Decoding nonzero value v */
+ /* Figure F.22: Decoding the sign of v (uses the fixed-probability bin) */
+ sign = arith_decode(cinfo, entropy->fixed_bin);
+ st += 2;
+ /* Figure F.23: Decoding the magnitude category of v */
+ if ((m = arith_decode(cinfo, st)) != 0) {
+ if (arith_decode(cinfo, st)) {
+ m <<= 1;
+ st = entropy->ac_stats[tbl] +
+ (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+ while (arith_decode(cinfo, st)) {
+ if ((m <<= 1) == 0x8000) {
+ WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+ entropy->ct = -1; /* magnitude overflow; marks the decoder as failed */
+ return TRUE;
+ }
+ st += 1;
+ }
+ }
+ }
+ v = m;
+ /* Figure F.24: Decoding the magnitude bit pattern of v (bin is 14 past the last category bin) */
+ st += 14;
+ while (m >>= 1)
+ if (arith_decode(cinfo, st)) v |= m;
+ v += 1; if (sign) v = -v;
+ if (block) /* store in natural (dezigzagged) order, unscaled */
+ (*block)[jpeg_natural_order[k]] = (JCOEF)v;
+ }
+ }
+
+ return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass(j_decompress_ptr cinfo)
+{
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
+ int ci, tbl;
+ jpeg_component_info *compptr;
+
+ if (cinfo->progressive_mode) {
+ /* Validate progressive scan parameters; any violation jumps to "bad" */
+ if (cinfo->Ss == 0) {
+ if (cinfo->Se != 0) /* a scan starting at 0 must be DC-only */
+ goto bad;
+ } else {
+ /* need not check Ss/Se < 0 since they came from unsigned bytes */
+ if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1)
+ goto bad;
+ /* AC scans may have only one component */
+ if (cinfo->comps_in_scan != 1)
+ goto bad;
+ }
+ if (cinfo->Ah != 0) {
+ /* Successive approximation refinement scan: must have Al = Ah-1. */
+ if (cinfo->Ah - 1 != cinfo->Al)
+ goto bad;
+ }
+ if (cinfo->Al > 13) { /* need not check for < 0 */
+bad:
+ ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+ cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+ }
+ /* Update progression status, and verify that scan order is legal.
+ * Note that inter-scan inconsistencies are treated as warnings
+ * not fatal errors ... not clear if this is right way to behave.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+ int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ int *prev_coef_bit_ptr =
+ &cinfo->coef_bits[cindex + cinfo->num_components][0];
+ if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+ if (cinfo->input_scan_number > 1) /* save the status from before this scan */
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
+ for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+ int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+ if (cinfo->Ah != expected) /* Ah must equal the Al recorded by the previous scan (0 if none) */
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+ coef_bit_ptr[coefi] = cinfo->Al;
+ }
+ }
+ /* Select MCU decoding routine: first/refine by Ah, DC/AC by Ss */
+ if (cinfo->Ah == 0) {
+ if (cinfo->Ss == 0)
+ entropy->pub.decode_mcu = decode_mcu_DC_first;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_first;
+ } else {
+ if (cinfo->Ss == 0)
+ entropy->pub.decode_mcu = decode_mcu_DC_refine;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_refine;
+ }
+ } else {
+ /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+ * This ought to be an error condition, but we make it a warning.
+ */
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
+ cinfo->Ah != 0 || cinfo->Al != 0)
+ WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+ /* Select MCU decoding routine */
+ entropy->pub.decode_mcu = decode_mcu;
+ }
+
+ /* Allocate (first use only) & zero the statistics areas this scan needs */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+ tbl = compptr->dc_tbl_no;
+ if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+ ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+ if (entropy->dc_stats[tbl] == NULL)
+ entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+ memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
+ /* Initialize DC predictions to 0 */
+ entropy->last_dc_val[ci] = 0;
+ entropy->dc_context[ci] = 0;
+ }
+ if (!cinfo->progressive_mode || cinfo->Ss) {
+ tbl = compptr->ac_tbl_no;
+ if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+ ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+ if (entropy->ac_stats[tbl] == NULL)
+ entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+ memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
+ }
+ }
+
+ /* Initialize arithmetic decoding variables (also clears a ct == -1 error mark) */
+ entropy->c = 0;
+ entropy->a = 0;
+ entropy->ct = -16; /* force reading 2 initial bytes to fill C */
+ entropy->pub.insufficient_data = FALSE;
+
+ /* Initialize restart counter */
+ entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder(j_decompress_ptr cinfo)
+{
+  j_common_ptr common = (j_common_ptr)cinfo;
+  arith_entropy_ptr state;
+  int t;
+
+  /* Allocate the decoder state in the per-image pool and hook it up */
+  state = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) (common, JPOOL_IMAGE,
+                                sizeof(arith_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *)state;
+  state->pub.start_pass = start_pass;
+
+  /* No statistics areas exist yet; start_pass allocates them on demand */
+  for (t = 0; t < NUM_ARITH_TBLS; t++)
+    state->dc_stats[t] = state->ac_stats[t] = NULL;
+
+  /* Initialize index for fixed probability estimation */
+  state->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Progression status table: 2 rows of DCTSIZE2 ints per component.
+     * Only the first num_components rows are preset to -1 here. */
+    int row, col;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) (common, JPOOL_IMAGE,
+                                  cinfo->num_components * 2 * DCTSIZE2 *
+                                  sizeof(int));
+    for (row = 0; row < cinfo->num_components; row++)
+      for (col = 0; col < DCTSIZE2; col++)
+        cinfo->coef_bits[row][col] = -1;
+  }
+}
diff --git a/media/libjpeg/jdatadst.c b/media/libjpeg/jdatadst.c
new file mode 100644
index 0000000000..6b4fed2339
--- /dev/null
+++ b/media/libjpeg/jdatadst.c
@@ -0,0 +1,287 @@
+/*
+ * jdatadst.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2012 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains compression data destination routines for the case of
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
+ * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
+ * JOCTETs into 8-bit-wide elements on external storage. If char is wider
+ * than 8 bits on your machine, you may need to do some tweaking.
+ */
+
+/* this is not a core library module, so it doesn't define JPEG_INTERNALS */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/* Expanded data destination object for stdio output */
+
+typedef struct {
+ struct jpeg_destination_mgr pub; /* public fields; first member, since cinfo->dest is cast to this type */
+
+ FILE *outfile; /* target stream */
+ JOCTET *buffer; /* start of the OUTPUT_BUF_SIZE-byte output buffer */
+} my_destination_mgr;
+
+typedef my_destination_mgr *my_dest_ptr;
+
+#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */
+
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+/* Expanded data destination object for memory output */
+
+typedef struct {
+ struct jpeg_destination_mgr pub; /* public fields; first member, since cinfo->dest is cast to this type */
+
+ unsigned char **outbuffer; /* target buffer; receives the final buffer pointer at termination */
+ unsigned long *outsize; /* receives the number of bytes written at termination */
+ unsigned char *newbuffer; /* newly allocated buffer (NULL while a caller-supplied buffer is in use) */
+ JOCTET *buffer; /* start of buffer */
+ size_t bufsize; /* current capacity of buffer, in bytes */
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
+#endif
+
+
+/*
+ * Initialize destination --- called by jpeg_start_compress
+ * before any data is actually written.
+ */
+
+METHODDEF(void)
+init_destination(j_compress_ptr cinfo)
+{
+  my_dest_ptr mgr = (my_dest_ptr)cinfo->dest;
+  JOCTET *out;
+
+  /* The buffer comes from the per-image pool: released when image is done */
+  out = (JOCTET *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(JOCTET) * OUTPUT_BUF_SIZE);
+  mgr->pub.next_output_byte = mgr->buffer = out;   /* nothing written yet */
+  mgr->pub.free_in_buffer = OUTPUT_BUF_SIZE;
+}
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+METHODDEF(void)
+init_mem_destination(j_compress_ptr cinfo)
+{
+ /* no work necessary here: jpeg_mem_dest() has already set up the buffer, its size and the write position */
+}
+#endif
+
+
+/*
+ * Empty the output buffer --- called whenever buffer fills up.
+ *
+ * In typical applications, this should write the entire output buffer
+ * (ignoring the current state of next_output_byte & free_in_buffer),
+ * reset the pointer & count to the start of the buffer, and return TRUE
+ * indicating that the buffer has been dumped.
+ *
+ * In applications that need to be able to suspend compression due to output
+ * overrun, a FALSE return indicates that the buffer cannot be emptied now.
+ * In this situation, the compressor will return to its caller (possibly with
+ * an indication that it has not accepted all the supplied scanlines). The
+ * application should resume compression after it has made more room in the
+ * output buffer. Note that there are substantial restrictions on the use of
+ * suspension --- see the documentation.
+ *
+ * When suspending, the compressor will back up to a convenient restart point
+ * (typically the start of the current MCU). next_output_byte & free_in_buffer
+ * indicate where the restart point will be if the current call returns FALSE.
+ * Data beyond this point will be regenerated after resumption, so do not
+ * write it out when emptying the buffer externally.
+ */
+
+METHODDEF(boolean)
+empty_output_buffer(j_compress_ptr cinfo)
+{
+  my_dest_ptr mgr = (my_dest_ptr)cinfo->dest;
+  size_t written;
+
+  /* Always flush the whole buffer, whatever next_output_byte says */
+  written = fwrite(mgr->buffer, 1, OUTPUT_BUF_SIZE, mgr->outfile);
+  if (written != (size_t)OUTPUT_BUF_SIZE)
+    ERREXIT(cinfo, JERR_FILE_WRITE);
+  mgr->pub.free_in_buffer = OUTPUT_BUF_SIZE;    /* buffer is empty again */
+  mgr->pub.next_output_byte = mgr->buffer;
+  return TRUE;
+}
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+METHODDEF(boolean)
+empty_mem_output_buffer(j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET *nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
+
+  /* Grow by doubling.  If bufsize * 2 wraps around (or bufsize is 0) the new
+   * buffer could not hold the memcpy() below, so fail as for out-of-memory. */
+  nextsize = dest->bufsize * 2;
+  if (nextsize <= dest->bufsize)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+  nextbuffer = (JOCTET *)malloc(nextsize);
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  /* Keep the data written so far; release the old buffer if we own it */
+  memcpy(nextbuffer, dest->buffer, dest->bufsize);
+  free(dest->newbuffer);
+  dest->newbuffer = nextbuffer;
+
+  /* The old contents fill the first half; the second half is free space */
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+  return TRUE;
+}
+#endif
+
+
+/*
+ * Terminate destination --- called by jpeg_finish_compress
+ * after all data has been written. Usually needs to flush buffer.
+ *
+ * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding
+ * application must deal with any cleanup that should happen even
+ * for error exit.
+ */
+
+METHODDEF(void)
+term_destination(j_compress_ptr cinfo)
+{
+  my_dest_ptr mgr = (my_dest_ptr)cinfo->dest;
+  FILE *fp = mgr->outfile;
+  size_t pending = OUTPUT_BUF_SIZE - mgr->pub.free_in_buffer;
+
+  /* Flush the partially filled buffer, if it holds anything */
+  if (pending != 0 && fwrite(mgr->buffer, 1, pending, fp) != pending)
+    ERREXIT(cinfo, JERR_FILE_WRITE);
+
+  /* Push out stdio's buffer, then make sure the stream saw no error */
+  fflush(fp);
+  if (ferror(fp))
+    ERREXIT(cinfo, JERR_FILE_WRITE);
+}
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+METHODDEF(void)
+term_mem_destination(j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr mgr = (my_mem_dest_ptr)cinfo->dest;
+  size_t used = mgr->bufsize - mgr->pub.free_in_buffer;  /* bytes written */
+  *mgr->outbuffer = mgr->buffer;        /* may differ from the initial one */
+  *mgr->outsize = (unsigned long)used;
+}
+#endif
+
+
+/*
+ * Prepare for output to a stdio stream.
+ * The caller must have already opened the stream, and is responsible
+ * for closing it after finishing compression.
+ */
+
+GLOBAL(void)
+jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
+{
+  my_dest_ptr mgr;
+
+  if (cinfo->dest != NULL) {
+    /* Reuse an installed manager only if this function created it (told by
+     * its init_destination hook); anything else may have the wrong size.
+     * Allocating a replacement instead would leave the old structure
+     * unfreed until jpeg_destroy_compress() was called. */
+    if (cinfo->dest->init_destination != init_destination)
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  } else {
+    /* First use with this JPEG object.  The manager is permanent so that
+     * several images can be written to the same file without calling
+     * jpeg_stdio_dest() again. */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_destination_mgr));
+  }
+
+  /* (Re)install the stdio methods and remember the target stream */
+  mgr = (my_dest_ptr)cinfo->dest;
+  mgr->outfile = outfile;
+  mgr->pub.init_destination = init_destination;
+  mgr->pub.empty_output_buffer = empty_output_buffer;
+  mgr->pub.term_destination = term_destination;
+}
+
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+/*
+ * Prepare for output to a memory buffer.
+ * The caller may supply its own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
+ * Note: An initial buffer supplied by the caller is expected to be
+ * managed by the application. The library does not free such buffer
+ * when allocating a larger buffer.
+ */
+
+GLOBAL(void)
+jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+              unsigned long *outsize)
+{
+  my_mem_dest_ptr mgr;
+
+  if (outbuffer == NULL || outsize == NULL)     /* both are mandatory */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  if (cinfo->dest != NULL) {
+    /* Reuse an installed manager only if this function created it (told by
+     * its init_destination hook); anything else may have the wrong size. */
+    if (cinfo->dest->init_destination != init_mem_destination)
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  } else {
+    /* First use with this JPEG object.  The manager is permanent so that
+     * several images can be written without calling jpeg_mem_dest() again. */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_mem_destination_mgr));
+  }
+
+  mgr = (my_mem_dest_ptr)cinfo->dest;
+  mgr->pub.init_destination = init_mem_destination;
+  mgr->pub.empty_output_buffer = empty_mem_output_buffer;
+  mgr->pub.term_destination = term_mem_destination;
+  mgr->outbuffer = outbuffer;
+  mgr->outsize = outsize;
+  mgr->newbuffer = NULL;        /* no library-owned buffer yet */
+
+  /* Without a usable caller buffer, start with one of our own */
+  if (*outbuffer == NULL || *outsize == 0) {
+    unsigned char *fresh = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
+    mgr->newbuffer = *outbuffer = fresh;
+    if (fresh == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  mgr->pub.next_output_byte = mgr->buffer = *outbuffer;
+  mgr->pub.free_in_buffer = mgr->bufsize = *outsize;
+}
+#endif
diff --git a/media/libjpeg/jdatasrc.c b/media/libjpeg/jdatasrc.c
new file mode 100644
index 0000000000..e36a30d894
--- /dev/null
+++ b/media/libjpeg/jdatasrc.c
@@ -0,0 +1,295 @@
+/*
+ * jdatasrc.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains decompression data source routines for the case of
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
+ * IMPORTANT: we assume that fread() will correctly transcribe an array of
+ * JOCTETs from 8-bit-wide elements on external storage. If char is wider
+ * than 8 bits on your machine, you may need to do some tweaking.
+ */
+
+/* this is not a core library module, so it doesn't define JPEG_INTERNALS */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/* Expanded data source object for stdio input */
+
+typedef struct {
+ struct jpeg_source_mgr pub; /* public fields; first member, since cinfo->src is cast to this type */
+
+ FILE *infile; /* source stream */
+ JOCTET *buffer; /* start of the INPUT_BUF_SIZE-byte input buffer */
+ boolean start_of_file; /* have we gotten any data yet? (TRUE until the first buffer fill) */
+} my_source_mgr;
+
+typedef my_source_mgr *my_src_ptr;
+
+#define INPUT_BUF_SIZE 4096 /* choose an efficiently fread'able size */
+
+
+/*
+ * Initialize source --- called by jpeg_read_header
+ * before any data is actually read.
+ */
+
+METHODDEF(void)
+init_source(j_decompress_ptr cinfo)
+{
+  my_src_ptr mgr = (my_src_ptr)cinfo->src;
+
+  /* Only the "no data read yet" flag is re-armed for each image.  Bytes
+   * already in the input buffer are deliberately kept, which is what
+   * reading a series of images from one source requires.
+   */
+  mgr->start_of_file = TRUE;
+}
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+METHODDEF(void)
+init_mem_source(j_decompress_ptr cinfo)
+{
+ /* no work necessary here: this hook is intentionally empty */
+}
+#endif
+
+
+/*
+ * Fill the input buffer --- called whenever buffer is emptied.
+ *
+ * In typical applications, this should read fresh data into the buffer
+ * (ignoring the current state of next_input_byte & bytes_in_buffer),
+ * reset the pointer & count to the start of the buffer, and return TRUE
+ * indicating that the buffer has been reloaded. It is not necessary to
+ * fill the buffer entirely, only to obtain at least one more byte.
+ *
+ * There is no such thing as an EOF return. If the end of the file has been
+ * reached, the routine has a choice of ERREXIT() or inserting fake data into
+ * the buffer. In most cases, generating a warning message and inserting a
+ * fake EOI marker is the best course of action --- this will allow the
+ * decompressor to output however much of the image is there. However,
+ * the resulting error message is misleading if the real problem is an empty
+ * input file, so we handle that case specially.
+ *
+ * In applications that need to be able to suspend decompression due to input
+ * not being available yet, a FALSE return indicates that no more data can be
+ * obtained right now, but more may be forthcoming later. In this situation,
+ * the decompressor will return to its caller (with an indication of the
+ * number of scanlines it has read, if any). The application should resume
+ * decompression after it has loaded more data into the input buffer. Note
+ * that there are substantial restrictions on the use of suspension --- see
+ * the documentation.
+ *
+ * When suspending, the decompressor will back up to a convenient restart point
+ * (typically the start of the current MCU). next_input_byte & bytes_in_buffer
+ * indicate where the restart point will be if the current call returns FALSE.
+ * Data beyond this point must be rescanned after resumption, so move it to
+ * the front of the buffer rather than discarding it.
+ */
+
+METHODDEF(boolean)
+fill_input_buffer(j_decompress_ptr cinfo)
+{
+  my_src_ptr mgr = (my_src_ptr)cinfo->src;
+  size_t got = fread(mgr->buffer, 1, INPUT_BUF_SIZE, mgr->infile);
+
+  if (got == 0) {
+    /* Nothing could be read.  An input that never produced any data is a
+     * fatal error; otherwise warn and hand the decoder a fake EOI marker
+     * so it can output whatever part of the image it already has. */
+    if (mgr->start_of_file)
+      ERREXIT(cinfo, JERR_INPUT_EMPTY);
+    WARNMS(cinfo, JWRN_JPEG_EOF);
+    mgr->buffer[0] = (JOCTET)0xFF;
+    mgr->buffer[1] = (JOCTET)JPEG_EOI;
+    got = 2;
+  }
+
+  /* Publish the refilled buffer; from now on the source is not "empty" */
+  mgr->start_of_file = FALSE;
+  mgr->pub.bytes_in_buffer = got;
+  mgr->pub.next_input_byte = mgr->buffer;
+  return TRUE;
+}
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+METHODDEF(boolean)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
+{
+  /* Fake EOI marker; only the first two bytes are ever handed out */
+  static const JOCTET fake_eoi[4] = {
+    (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
+  };
+  struct jpeg_source_mgr *src;
+
+  /* All of the JPEG data is expected to be in the memory buffer supplied
+   * by the application, so running out of input means the data is
+   * truncated.  Warn, then let the decoder see an EOI marker.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+  src = cinfo->src;
+  src->bytes_in_buffer = 2;
+  src->next_input_byte = fake_eoi;
+
+  return TRUE;
+}
+#endif
+
+
+/*
+ * Skip data --- used to skip over a potentially large amount of
+ * uninteresting data (such as an APPn marker).
+ *
+ * Writers of suspendable-input applications must note that skip_input_data
+ * is not granted the right to give a suspension return. If the skip extends
+ * beyond the data currently in the buffer, the buffer can be marked empty so
+ * that the next read will cause a fill_input_buffer call that can suspend.
+ * Arranging for additional bytes to be discarded before reloading the input
+ * buffer is the application writer's problem.
+ */
+
+METHODDEF(void)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
+{
+  struct jpeg_source_mgr *src = cinfo->src;
+  long remaining = num_bytes;
+
+  if (remaining <= 0)           /* nothing to skip */
+    return;
+
+  /* Simple approach: discard whole buffers until the rest of the skip fits
+   * in the current one.  fseek() is not used because it does not work on
+   * pipes.  fill_input_buffer is assumed never to return FALSE, so its
+   * result is ignored and suspension is not handled.
+   */
+  while (remaining > (long)src->bytes_in_buffer) {
+    remaining -= (long)src->bytes_in_buffer;
+    (void)(*src->fill_input_buffer) (cinfo);
+  }
+  src->bytes_in_buffer -= (size_t)remaining;
+  src->next_input_byte += (size_t)remaining;
+}
+
+
+/*
+ * An additional method that can be provided by data source modules is the
+ * resync_to_restart method for error recovery in the presence of RST markers.
+ * For the moment, this source module just uses the default resync method
+ * provided by the JPEG library. That method assumes that no backtracking
+ * is possible.
+ */
+
+
+/*
+ * Terminate source --- invoked by jpeg_finish_decompress once all of the
+ * data has been read.  This module has nothing to release, so the method
+ * does no work.
+ *
+ * NB: jpeg_abort and jpeg_destroy do *not* call this method; any cleanup
+ * that must also happen on error exit is the responsibility of the
+ * surrounding application.
+ */
+
+METHODDEF(void)
+term_source(j_decompress_ptr cinfo)
+{
+  (void)cinfo;                  /* unused: no work necessary here */
+}
+
+
+/*
+ * Prepare for input from a stdio stream.
+ * The stream must already have been opened by the caller, who also remains
+ * responsible for closing it once decompression has finished.
+ */
+
+GLOBAL(void)
+jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
+{
+  my_src_ptr stdio_src;
+
+  if (cinfo->src != NULL) {
+    /* A source manager is already installed.  It may be reused only if this
+     * function created it; otherwise there is no guarantee that the opaque
+     * structure is the right size.  (Creating a replacement structure would
+     * work, but the old one would not be freed until
+     * jpeg_destroy_decompress() was called.)
+     */
+    if (cinfo->src->init_source != init_source) {
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
+    }
+  } else {
+    /* First use with this JPEG object.  Both the source object and its input
+     * buffer live in the permanent pool, so that a series of JPEG images can
+     * be read from the same file by calling jpeg_stdio_src only before the
+     * first one.  (Discarding the buffer at the end of one image would
+     * likely lose the start of the next one.)
+     */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_source_mgr));
+    stdio_src = (my_src_ptr)cinfo->src;
+    stdio_src->buffer = (JOCTET *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  INPUT_BUF_SIZE * sizeof(JOCTET));
+  }
+
+  stdio_src = (my_src_ptr)cinfo->src;
+  stdio_src->infile = infile;
+
+  /* Install the method table; restart resynchronization uses the library's
+   * default routine.
+   */
+  stdio_src->pub.init_source = init_source;
+  stdio_src->pub.fill_input_buffer = fill_input_buffer;
+  stdio_src->pub.skip_input_data = skip_input_data;
+  stdio_src->pub.resync_to_restart = jpeg_resync_to_restart;
+  stdio_src->pub.term_source = term_source;
+
+  /* Start with an empty buffer: a zero count forces fill_input_buffer on the
+   * first read, and the pointer stays NULL until the buffer is loaded.
+   */
+  stdio_src->pub.next_input_byte = NULL;
+  stdio_src->pub.bytes_in_buffer = 0;
+}
+
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+/*
+ * Prepare for input from a caller-supplied memory buffer.
+ * The buffer is expected to hold the complete JPEG datastream.
+ */
+
+GLOBAL(void)
+jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+             unsigned long insize)
+{
+  struct jpeg_source_mgr *mgr;
+
+  /* An absent or zero-length buffer is treated as a fatal error. */
+  if (insize == 0 || inbuffer == NULL) {
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+  }
+
+  if (cinfo->src != NULL) {
+    /* Reusing an existing source manager is unsafe unless this function
+     * created it.
+     */
+    if (cinfo->src->init_source != init_mem_source) {
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
+    }
+  } else {
+    /* First use with this JPEG object.  The source object lives in the
+     * permanent pool so that a series of JPEG images can be read from the
+     * same buffer by calling jpeg_mem_src only before the first one.
+     */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(struct jpeg_source_mgr));
+  }
+
+  mgr = cinfo->src;
+
+  /* The whole caller buffer is exposed to the decoder at once. */
+  mgr->next_input_byte = (const JOCTET *)inbuffer;
+  mgr->bytes_in_buffer = (size_t)insize;
+
+  /* Install the method table; restart resynchronization uses the library's
+   * default routine.
+   */
+  mgr->init_source = init_mem_source;
+  mgr->fill_input_buffer = fill_mem_input_buffer;
+  mgr->skip_input_data = skip_input_data;
+  mgr->resync_to_restart = jpeg_resync_to_restart;
+  mgr->term_source = term_source;
+}
+#endif
diff --git a/media/libjpeg/jdcoefct.c b/media/libjpeg/jdcoefct.c
new file mode 100644
index 0000000000..88e10c08cb
--- /dev/null
+++ b/media/libjpeg/jdcoefct.c
@@ -0,0 +1,878 @@
+/*
+ * jdcoefct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, 2015-2016, 2019-2020, 2022, D. R. Commander.
+ * Copyright (C) 2015, 2020, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the coefficient buffer controller for decompression.
+ * This controller is the top level of the JPEG decompressor proper.
+ * The coefficient buffer lies between entropy decoding and inverse-DCT steps.
+ *
+ * In buffered-image mode, this controller is the interface between
+ * input-oriented processing and output-oriented processing.
+ * Also, the input side (only) is used when reading a file for transcoding.
+ */
+
+#include "jinclude.h"
+#include "jdcoefct.h"
+#include "jpegcomp.h"
+
+
+/* Forward declarations, so that these routines can be referenced ahead of
+ * their definitions (start_output_pass, for one, selects between
+ * decompress_data and decompress_smooth_data after calling smoothing_ok).
+ * The multiscan and block-smoothing entries exist only when the
+ * corresponding feature macro is defined.
+ */
+METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+#endif
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
+METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
+#endif
+
+
+/*
+ * Begin an input processing pass: the input side restarts at the first
+ * iMCU row.
+ */
+
+METHODDEF(void)
+start_input_pass(j_decompress_ptr cinfo)
+{
+  /* Reset the input-side row counter first, then hand off to
+   * start_iMCU_row(), which is defined outside this file section.
+   */
+  cinfo->input_iMCU_row = 0;
+  start_iMCU_row(cinfo);
+}
+
+
+/*
+ * Begin an output processing pass: choose the decompress_data method for
+ * this pass (multipass operation with block smoothing support only) and
+ * restart output at the first iMCU row.
+ */
+
+METHODDEF(void)
+start_output_pass(j_decompress_ptr cinfo)
+{
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+  my_coef_ptr ctl = (my_coef_ptr)cinfo->coef;
+
+  /* coef_arrays is non-NULL only in multipass operation.  In that case the
+   * smoothing variant is installed when the application asked for block
+   * smoothing and smoothing_ok() approves it for this pass; otherwise the
+   * plain variant is installed.  smoothing_ok() is consulted only when
+   * do_block_smoothing is set.
+   */
+  if (ctl->pub.coef_arrays != NULL) {
+    ctl->pub.decompress_data =
+      (cinfo->do_block_smoothing && smoothing_ok(cinfo)) ?
+      decompress_smooth_data : decompress_data;
+  }
+#endif
+  cinfo->output_iMCU_row = 0;
+}
+
+
+/*
+ * Decompress and return some data in the single-pass case.
+ * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
+ * Input and output must run in lockstep since we have only a one-MCU buffer.
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * NB: output_buf contains a plane for each component in image,
+ * which we index according to the component's SOF position.
+ *
+ * Suspension: if the entropy decoder cannot deliver an MCU, the current
+ * position (yoffset, MCU_col_num) is stored in coef->MCU_vert_offset and
+ * coef->MCU_ctr before JPEG_SUSPENDED is returned; the loops below start from
+ * those stored values, so the next call resumes at the same MCU.
+ *
+ * Cropping: every MCU of the row is entropy-decoded, but the inverse DCT is
+ * applied only to MCU columns in the range
+ * cinfo->master->first_iMCU_col .. cinfo->master->last_iMCU_col, and only for
+ * components whose component_needed flag is set.  Output columns are counted
+ * relative to first_iMCU_col.
+ */
+
+METHODDEF(int)
+decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION MCU_col_num; /* index of current MCU within row */
+ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+ int blkn, ci, xindex, yindex, yoffset, useful_width;
+ JSAMPARRAY output_ptr;
+ JDIMENSION start_col, output_col;
+ jpeg_component_info *compptr;
+ inverse_DCT_method_ptr inverse_DCT;
+
+ /* Loop to process as much as one whole iMCU row */
+ for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
+ yoffset++) {
+ for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
+ MCU_col_num++) {
+ /* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */
+ jzero_far((void *)coef->MCU_buffer[0],
+ (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ /* Suspension forced; update state counters and exit */
+ coef->MCU_vert_offset = yoffset;
+ coef->MCU_ctr = MCU_col_num;
+ return JPEG_SUSPENDED;
+ }
+
+ /* Only perform the IDCT on blocks that are contained within the desired
+ * cropping region.
+ */
+ if (MCU_col_num >= cinfo->master->first_iMCU_col &&
+ MCU_col_num <= cinfo->master->last_iMCU_col) {
+ /* Determine where data should go in output_buf and do the IDCT thing.
+ * We skip dummy blocks at the right and bottom edges (but blkn gets
+ * incremented past them!). Note the inner loop relies on having
+ * allocated the MCU_buffer[] blocks sequentially.
+ */
+ blkn = 0; /* index of current DCT block within MCU */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ /* Don't bother to IDCT an uninteresting component. */
+ if (!compptr->component_needed) {
+ blkn += compptr->MCU_blocks;
+ continue;
+ }
+ inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+ useful_width = (MCU_col_num < last_MCU_col) ?
+ compptr->MCU_width : compptr->last_col_width;
+ output_ptr = output_buf[compptr->component_index] +
+ yoffset * compptr->_DCT_scaled_size;
+ start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
+ compptr->MCU_sample_width;
+ for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+ if (cinfo->input_iMCU_row < last_iMCU_row ||
+ yoffset + yindex < compptr->last_row_height) {
+ output_col = start_col;
+ for (xindex = 0; xindex < useful_width; xindex++) {
+ (*inverse_DCT) (cinfo, compptr,
+ (JCOEFPTR)coef->MCU_buffer[blkn + xindex],
+ output_ptr, output_col);
+ output_col += compptr->_DCT_scaled_size;
+ }
+ }
+ blkn += compptr->MCU_width;
+ output_ptr += compptr->_DCT_scaled_size;
+ }
+ }
+ }
+ }
+ /* Completed an MCU row, but perhaps not an iMCU row */
+ coef->MCU_ctr = 0;
+ }
+ /* Completed the iMCU row, advance counters for next one.  Both the output
+ * and the input row counters advance here, since the two sides run in
+ * lockstep in this mode.
+ */
+ cinfo->output_iMCU_row++;
+ if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) {
+ start_iMCU_row(cinfo);
+ return JPEG_ROW_COMPLETED;
+ }
+ /* Completed the scan */
+ (*cinfo->inputctl->finish_input_pass) (cinfo);
+ return JPEG_SCAN_COMPLETED;
+}
+
+
+/*
+ * Placeholder consume-input method installed for single-pass operation.
+ * It performs no work and always reports JPEG_SUSPENDED.
+ */
+
+METHODDEF(int)
+dummy_consume_data(j_decompress_ptr cinfo)
+{
+  (void)cinfo;                  /* unused */
+  return JPEG_SUSPENDED;        /* i.e. nothing was done */
+}
+
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+
+/*
+ * Consume input data and store it in the full-image coefficient buffer.
+ * We read as much as one fully interleaved MCU row ("iMCU" row) per call,
+ * ie, v_samp_factor block rows for each component in the scan.
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * For each MCU, coef->MCU_buffer[] is filled with pointers directly into the
+ * virtual arrays, so the entropy decoder writes its output in place.
+ *
+ * Suspension: if the entropy decoder cannot deliver an MCU, the current
+ * position (yoffset, MCU_col_num) is stored in coef->MCU_vert_offset and
+ * coef->MCU_ctr before JPEG_SUSPENDED is returned; the loops below start from
+ * those stored values, so the next call resumes at the same MCU.
+ */
+
+METHODDEF(int)
+consume_data(j_decompress_ptr cinfo)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION MCU_col_num; /* index of current MCU within row */
+ int blkn, ci, xindex, yindex, yoffset;
+ JDIMENSION start_col;
+ JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
+ JBLOCKROW buffer_ptr;
+ jpeg_component_info *compptr;
+
+ /* Align the virtual buffers for the components used in this scan.
+ * Each access covers the v_samp_factor block rows of the current input
+ * iMCU row; the final TRUE argument presumably requests writable access
+ * (confirm against the memory manager interface).
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ buffer[ci] = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
+ cinfo->input_iMCU_row * compptr->v_samp_factor,
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
+ /* Note: entropy decoder expects buffer to be zeroed,
+ * but this is handled automatically by the memory manager
+ * because we requested a pre-zeroed array.
+ */
+ }
+
+ /* Loop to process one whole iMCU row */
+ for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
+ yoffset++) {
+ for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
+ MCU_col_num++) {
+ /* Construct list of pointers to DCT blocks belonging to this MCU */
+ blkn = 0; /* index of current DCT block within MCU */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ start_col = MCU_col_num * compptr->MCU_width;
+ for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
+ for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+ coef->MCU_buffer[blkn++] = buffer_ptr++;
+ }
+ }
+ }
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+ /* Try to fetch the MCU. */
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ /* Suspension forced; update state counters and exit */
+ coef->MCU_vert_offset = yoffset;
+ coef->MCU_ctr = MCU_col_num;
+ return JPEG_SUSPENDED;
+ }
+ }
+ /* Completed an MCU row, but perhaps not an iMCU row */
+ coef->MCU_ctr = 0;
+ }
+ /* Completed the iMCU row, advance counters for next one */
+ if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) {
+ start_iMCU_row(cinfo);
+ return JPEG_ROW_COMPLETED;
+ }
+ /* Completed the scan */
+ (*cinfo->inputctl->finish_input_pass) (cinfo);
+ return JPEG_SCAN_COMPLETED;
+}
+
+
+/*
+ * Decompress and return some data in the multi-pass case.
+ * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * NB: output_buf contains a plane for each component in image.
+ *
+ * Before any output is produced, input is consumed until the input side is
+ * past the row about to be output: either input_scan_number exceeds
+ * output_scan_number, or the scan numbers are equal and input_iMCU_row
+ * exceeds output_iMCU_row.  A JPEG_SUSPENDED result from consume_input is
+ * passed straight back to the caller.
+ *
+ * For each needed component, only block columns
+ * cinfo->master->first_MCU_col[ci] .. cinfo->master->last_MCU_col[ci] are
+ * inverse-transformed, and they are written starting at output column 0.
+ */
+
+METHODDEF(int)
+decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+ JDIMENSION block_num;
+ int ci, block_row, block_rows;
+ JBLOCKARRAY buffer;
+ JBLOCKROW buffer_ptr;
+ JSAMPARRAY output_ptr;
+ JDIMENSION output_col;
+ jpeg_component_info *compptr;
+ inverse_DCT_method_ptr inverse_DCT;
+
+ /* Force some input to be done if we are getting ahead of the input. */
+ while (cinfo->input_scan_number < cinfo->output_scan_number ||
+ (cinfo->input_scan_number == cinfo->output_scan_number &&
+ cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
+ return JPEG_SUSPENDED;
+ }
+
+ /* OK, output from the virtual arrays. */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Don't bother to IDCT an uninteresting component. */
+ if (!compptr->component_needed)
+ continue;
+ /* Align the virtual buffer for this component. */
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ cinfo->output_iMCU_row * compptr->v_samp_factor,
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
+ /* Count non-dummy DCT block rows in this iMCU row. */
+ if (cinfo->output_iMCU_row < last_iMCU_row)
+ block_rows = compptr->v_samp_factor;
+ else {
+ /* NB: can't use last_row_height here; it is input-side-dependent! */
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+ if (block_rows == 0) block_rows = compptr->v_samp_factor;
+ }
+ inverse_DCT = cinfo->idct->inverse_DCT[ci];
+ output_ptr = output_buf[ci];
+ /* Loop over all DCT blocks to be processed. */
+ for (block_row = 0; block_row < block_rows; block_row++) {
+ buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
+ output_col = 0;
+ for (block_num = cinfo->master->first_MCU_col[ci];
+ block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)buffer_ptr, output_ptr,
+ output_col);
+ buffer_ptr++;
+ output_col += compptr->_DCT_scaled_size;
+ }
+ output_ptr += compptr->_DCT_scaled_size;
+ }
+ }
+
+ if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows)
+ return JPEG_ROW_COMPLETED;
+ return JPEG_SCAN_COMPLETED;
+}
+
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
+
+
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+
+/*
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
+ * We apply smoothing only for progressive JPEG decoding, and only if
+ * the coefficients it can estimate are not yet known to full precision.
+ */
+
+/* Natural-order array positions of the first 9 zigzag-order coefficients.
+ * The macros are listed in zigzag order; for a name Q<r><c>_POS the value is
+ * 8 * r + c (presumably row r, column c of the block).
+ */
+#define Q01_POS 1
+#define Q10_POS 8
+#define Q20_POS 16
+#define Q11_POS 9
+#define Q02_POS 2
+#define Q03_POS 3
+#define Q12_POS 10
+#define Q21_POS 17
+#define Q30_POS 24
+
+/*
+ * Determine whether block smoothing is applicable and safe.
+ * We also latch the current states of the coef_bits[] entries for the
+ * AC coefficients; otherwise, if the input side of the decompressor
+ * advances into a new scan, we might think the coefficients are known
+ * more accurately than they really are.
+ *
+ * Returns FALSE if the image is not progressive or cinfo->coef_bits is NULL,
+ * if any component has no quantization table yet, if the DC quantizer or any
+ * of the first 9 AC quantizers of a component is zero, or if a component's
+ * coef_bits[0] is negative.  Otherwise returns TRUE only if, for at least one
+ * component, some coef_bits entry in 1 .. SAVED_COEFS-1 is nonzero.
+ *
+ * Latch layout (allocated on first use from the image pool): the first
+ * num_components * SAVED_COEFS ints receive each component's current
+ * coef_bits[] entries; the second half receives the entries taken from
+ * cinfo->coef_bits[ci + num_components], or -1 for each AC entry when
+ * input_scan_number is not greater than 1.  Entry 0 of the second half is
+ * not written here.  An early FALSE return can leave the latch partially
+ * updated.
+ */
+
+LOCAL(boolean)
+smoothing_ok(j_decompress_ptr cinfo)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ boolean smoothing_useful = FALSE;
+ int ci, coefi;
+ jpeg_component_info *compptr;
+ JQUANT_TBL *qtable;
+ int *coef_bits, *prev_coef_bits;
+ int *coef_bits_latch, *prev_coef_bits_latch;
+
+ if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
+ return FALSE;
+
+ /* Allocate latch area if not already done */
+ if (coef->coef_bits_latch == NULL)
+ coef->coef_bits_latch = (int *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 *
+ (SAVED_COEFS * sizeof(int)));
+ coef_bits_latch = coef->coef_bits_latch;
+ prev_coef_bits_latch =
+ &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* All components' quantization values must already be latched. */
+ if ((qtable = compptr->quant_table) == NULL)
+ return FALSE;
+ /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
+ if (qtable->quantval[0] == 0 ||
+ qtable->quantval[Q01_POS] == 0 ||
+ qtable->quantval[Q10_POS] == 0 ||
+ qtable->quantval[Q20_POS] == 0 ||
+ qtable->quantval[Q11_POS] == 0 ||
+ qtable->quantval[Q02_POS] == 0 ||
+ qtable->quantval[Q03_POS] == 0 ||
+ qtable->quantval[Q12_POS] == 0 ||
+ qtable->quantval[Q21_POS] == 0 ||
+ qtable->quantval[Q30_POS] == 0)
+ return FALSE;
+ /* DC values must be at least partly known for all components. */
+ coef_bits = cinfo->coef_bits[ci];
+ prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
+ if (coef_bits[0] < 0)
+ return FALSE;
+ coef_bits_latch[0] = coef_bits[0];
+ /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
+ for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+ else
+ prev_coef_bits_latch[coefi] = -1;
+ coef_bits_latch[coefi] = coef_bits[coefi];
+ if (coef_bits[coefi] != 0)
+ smoothing_useful = TRUE;
+ }
+ coef_bits_latch += SAVED_COEFS;
+ prev_coef_bits_latch += SAVED_COEFS;
+ }
+
+ return smoothing_useful;
+}
+
+
+/*
+ * Variant of decompress_data for use when doing block smoothing.
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * Each block is copied into coef->workspace, selected coefficients that are
+ * still zero are replaced by estimates computed from surrounding DC values,
+ * and the workspace copy (not the stored block) is inverse-transformed.
+ *
+ * DC01..DC25 hold a 5x5 window of DC values in raster order: DC01..DC05 come
+ * from the block row two above, DC06..DC10 from the row above, DC11..DC15
+ * from the current row, DC16..DC20 from the row below, and DC21..DC25 from
+ * the row two below.  DC13 is the block being processed.  Where a
+ * neighboring block row does not exist, the nearest available row is used
+ * instead.
+ *
+ * The coef_bits values consulted are the ones latched by smoothing_ok().
+ * When output_iMCU_row is beyond cinfo->master->last_good_iMCU_row, the
+ * second half of the latch (the previous scan's values) is used.
+ */
+
+METHODDEF(int)
+decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+{
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+ JDIMENSION block_num, last_block_column;
+ int ci, block_row, block_rows, access_rows;
+ JBLOCKARRAY buffer;
+ JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+ JBLOCKROW next_block_row, next_next_block_row;
+ JSAMPARRAY output_ptr;
+ JDIMENSION output_col;
+ jpeg_component_info *compptr;
+ inverse_DCT_method_ptr inverse_DCT;
+ boolean change_dc;
+ JCOEF *workspace;
+ int *coef_bits;
+ JQUANT_TBL *quanttbl;
+ JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+ int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+ DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+ DC25;
+ int Al, pred;
+
+ /* Keep a local variable to avoid looking it up more than once */
+ workspace = coef->workspace;
+
+ /* Force some input to be done if we are getting ahead of the input.
+ * The loop ends once the input scan number exceeds the output scan number,
+ * once end of image has been reached, or (same scan) once the input row is
+ * far enough ahead as determined below.
+ */
+ while (cinfo->input_scan_number <= cinfo->output_scan_number &&
+ !cinfo->inputctl->eoi_reached) {
+ if (cinfo->input_scan_number == cinfo->output_scan_number) {
+ /* If input is working on current scan, we ordinarily want it to
+ * have completed the current row. But if input scan is DC,
+ * we want it to keep two rows ahead so that next two block rows' DC
+ * values are up to date.
+ */
+ JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
+ if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
+ break;
+ }
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
+ return JPEG_SUSPENDED;
+ }
+
+ /* OK, output from the virtual arrays. */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Don't bother to IDCT an uninteresting component. */
+ if (!compptr->component_needed)
+ continue;
+ /* Count non-dummy DCT block rows in this iMCU row. */
+ if (cinfo->output_iMCU_row + 1 < last_iMCU_row) {
+ block_rows = compptr->v_samp_factor;
+ access_rows = block_rows * 3; /* this and next two iMCU rows */
+ } else if (cinfo->output_iMCU_row < last_iMCU_row) {
+ block_rows = compptr->v_samp_factor;
+ access_rows = block_rows * 2; /* this and next iMCU row */
+ } else {
+ /* NB: can't use last_row_height here; it is input-side-dependent! */
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+ if (block_rows == 0) block_rows = compptr->v_samp_factor;
+ access_rows = block_rows; /* this iMCU row only */
+ }
+ /* Align the virtual buffer for this component.  In every branch, buffer
+ * ends up pointing at the first block row of the current iMCU row, so
+ * that negative indexes below reach the prior rows when they exist.
+ */
+ if (cinfo->output_iMCU_row > 1) {
+ access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+ (JDIMENSION)access_rows, FALSE);
+ buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+ } else if (cinfo->output_iMCU_row > 0) {
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
+ (JDIMENSION)access_rows, FALSE);
+ buffer += compptr->v_samp_factor; /* point to current iMCU row */
+ } else {
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
+ }
+ /* Fetch component-dependent info.
+ * If the current scan is incomplete, then we use the component-dependent
+ * info from the previous scan.
+ */
+ if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+ coef_bits =
+ coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+ else
+ coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+ /* We only do DC interpolation if no AC coefficient data is available. */
+ change_dc =
+ coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+ coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+ coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
+ quanttbl = compptr->quant_table;
+ Q00 = quanttbl->quantval[0];
+ Q01 = quanttbl->quantval[Q01_POS];
+ Q10 = quanttbl->quantval[Q10_POS];
+ Q20 = quanttbl->quantval[Q20_POS];
+ Q11 = quanttbl->quantval[Q11_POS];
+ Q02 = quanttbl->quantval[Q02_POS];
+ if (change_dc) {
+ Q03 = quanttbl->quantval[Q03_POS];
+ Q12 = quanttbl->quantval[Q12_POS];
+ Q21 = quanttbl->quantval[Q21_POS];
+ Q30 = quanttbl->quantval[Q30_POS];
+ }
+ inverse_DCT = cinfo->idct->inverse_DCT[ci];
+ output_ptr = output_buf[ci];
+ /* Loop over all DCT blocks to be processed. */
+ for (block_row = 0; block_row < block_rows; block_row++) {
+ buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
+
+ if (block_row > 0 || cinfo->output_iMCU_row > 0)
+ prev_block_row =
+ buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+ else
+ prev_block_row = buffer_ptr;
+
+ if (block_row > 1 || cinfo->output_iMCU_row > 1)
+ prev_prev_block_row =
+ buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+ else
+ prev_prev_block_row = prev_block_row;
+
+ if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+ next_block_row =
+ buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
+ else
+ next_block_row = buffer_ptr;
+
+ if (block_row < block_rows - 2 ||
+ cinfo->output_iMCU_row + 1 < last_iMCU_row)
+ next_next_block_row =
+ buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
+ else
+ next_next_block_row = next_block_row;
+
+ /* We fetch the surrounding DC values using a sliding-register approach.
+ * Initialize all 25 here so as to do the right thing on narrow pics.
+ */
+ DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+ DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+ DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+ DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+ DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
+ output_col = 0;
+ last_block_column = compptr->width_in_blocks - 1;
+ for (block_num = cinfo->master->first_MCU_col[ci];
+ block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
+ /* Fetch current DCT block into workspace so we can modify it. */
+ jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
+ /* Update DC values.  At the first processed column, the values one
+ * column to the right are loaded if that column exists; at every
+ * column, the values two columns to the right are loaded if that
+ * column exists.
+ * NOTE(review): at the first processed column, when only one further
+ * block column exists, DC05/DC10/DC15/DC20/DC25 keep the initial value
+ * taken from that first column rather than from the last column --
+ * confirm that this is intended.
+ */
+ if (block_num == cinfo->master->first_MCU_col[ci] &&
+ block_num < last_block_column) {
+ DC04 = (int)prev_prev_block_row[1][0];
+ DC09 = (int)prev_block_row[1][0];
+ DC14 = (int)buffer_ptr[1][0];
+ DC19 = (int)next_block_row[1][0];
+ DC24 = (int)next_next_block_row[1][0];
+ }
+ if (block_num + 1 < last_block_column) {
+ DC05 = (int)prev_prev_block_row[2][0];
+ DC10 = (int)prev_block_row[2][0];
+ DC15 = (int)buffer_ptr[2][0];
+ DC20 = (int)next_block_row[2][0];
+ DC25 = (int)next_next_block_row[2][0];
+ }
+ /* If DC interpolation is enabled, compute coefficient estimates using
+ * a Gaussian-like kernel, keeping the averages of the DC values.
+ *
+ * If DC interpolation is disabled, compute coefficient estimates using
+ * an algorithm similar to the one described in Section K.8 of the JPEG
+ * standard, except applied to a 5x5 window rather than a 3x3 window.
+ *
+ * An estimate is applied only if the coefficient is still zero and is
+ * not known to be fully accurate.
+ */
+ /* AC01 (natural-order position 1).  pred is |num| / (Q01 << 8) rounded
+ * to nearest; when Al > 0 it is capped at (1 << Al) - 1, and then the
+ * sign of num is restored.  The remaining AC estimates below follow the
+ * same pattern with their own quantizer and kernel.
+ */
+ if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+ 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+ 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+ DC21 - DC22 + DC24 + DC25) :
+ (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
+ if (num >= 0) {
+ pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q01 << 7) - num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[1] = (JCOEF)pred;
+ }
+ /* AC10 */
+ if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+ 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+ 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+ 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+ (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
+ if (num >= 0) {
+ pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q10 << 7) - num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[8] = (JCOEF)pred;
+ }
+ /* AC20 */
+ if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
+ num = Q00 * (change_dc ?
+ (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+ 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+ (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
+ if (num >= 0) {
+ pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q20 << 7) - num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[16] = (JCOEF)pred;
+ }
+ /* AC11 */
+ if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+ 9 * DC19 + DC21 - DC25) :
+ (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+ DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
+ if (num >= 0) {
+ pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q11 << 7) - num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[9] = (JCOEF)pred;
+ }
+ /* AC02 */
+ if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
+ num = Q00 * (change_dc ?
+ (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+ 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+ (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
+ if (num >= 0) {
+ pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q02 << 7) - num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[2] = (JCOEF)pred;
+ }
+ if (change_dc) {
+ /* AC03 */
+ if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+ num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[3] = (JCOEF)pred;
+ }
+ /* AC12 */
+ if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+ num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[10] = (JCOEF)pred;
+ }
+ /* AC21 */
+ if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+ num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[17] = (JCOEF)pred;
+ }
+ /* AC30 */
+ if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+ num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[24] = (JCOEF)pred;
+ }
+ /* coef_bits[0] is non-negative. Otherwise this function would not
+ * be called.  The DC value itself is replaced unconditionally by a
+ * weighted combination of the 25 window values (no Al cap is applied
+ * to this one).
+ */
+ num = Q00 *
+ (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+ 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+ 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+ 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+ 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+ if (num >= 0) {
+ pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+ } else {
+ pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+ pred = -pred;
+ }
+ workspace[0] = (JCOEF)pred;
+ } /* change_dc */
+
+ /* OK, do the IDCT */
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
+ output_col);
+ /* Advance for next column: each window row shifts by one position, so
+ * that the old right-hand neighbors become the new center.  The
+ * rightmost entry of each row (DC05, DC10, ...) is left as is until it
+ * is reloaded above.
+ */
+ DC01 = DC02; DC02 = DC03; DC03 = DC04; DC04 = DC05;
+ DC06 = DC07; DC07 = DC08; DC08 = DC09; DC09 = DC10;
+ DC11 = DC12; DC12 = DC13; DC13 = DC14; DC14 = DC15;
+ DC16 = DC17; DC17 = DC18; DC18 = DC19; DC19 = DC20;
+ DC21 = DC22; DC22 = DC23; DC23 = DC24; DC24 = DC25;
+ buffer_ptr++, prev_block_row++, next_block_row++,
+ prev_prev_block_row++, next_next_block_row++;
+ output_col += compptr->_DCT_scaled_size;
+ }
+ output_ptr += compptr->_DCT_scaled_size;
+ }
+ }
+
+ if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows)
+ return JPEG_ROW_COMPLETED;
+ return JPEG_SCAN_COMPLETED;
+}
+
+#endif /* BLOCK_SMOOTHING_SUPPORTED */
+
+
+/*
+ * Initialize coefficient buffer controller.
+ *
+ * Allocates the controller object from the image pool, installs it as
+ * cinfo->coef, and sets its start_input_pass / start_output_pass methods.
+ *
+ * need_full_buffer TRUE: a pre-zeroed full-image virtual block array is
+ * requested for each component, the consume_data / decompress_data methods
+ * are installed, and pub.coef_arrays points at the virtual arrays.  This
+ * path raises JERR_NOT_COMPILED if D_MULTISCAN_FILES_SUPPORTED is not
+ * defined.
+ *
+ * need_full_buffer FALSE: a single buffer of D_MAX_BLOCKS_IN_MCU blocks is
+ * allocated and MCU_buffer[] is pointed at its consecutive blocks;
+ * dummy_consume_data / decompress_onepass are installed, and pub.coef_arrays
+ * is NULL.
+ *
+ * In both cases a workspace of DCTSIZE2 coefficients is allocated last.
+ */
+
+GLOBAL(void)
+jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+{
+ my_coef_ptr coef;
+
+ coef = (my_coef_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_coef_controller));
+ cinfo->coef = (struct jpeg_d_coef_controller *)coef;
+ coef->pub.start_input_pass = start_input_pass;
+ coef->pub.start_output_pass = start_output_pass;
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+ coef->coef_bits_latch = NULL;
+#endif
+
+ /* Create the coefficient buffer. */
+ if (need_full_buffer) {
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+ /* Allocate a full-image virtual array for each component, */
+ /* padded to a multiple of samp_factor DCT blocks in each direction. */
+ /* Note we ask for a pre-zeroed array. */
+ int ci, access_rows;
+ jpeg_component_info *compptr;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ access_rows = compptr->v_samp_factor;
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+ /* If block smoothing could be used, need a bigger window: the smoothing
+ * code accesses up to five iMCU rows at once (current row, two before,
+ * and two after).
+ */
+ if (cinfo->progressive_mode)
+ access_rows *= 5;
+#endif
+ coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
+ (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+ (long)compptr->h_samp_factor),
+ (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+ (long)compptr->v_samp_factor),
+ (JDIMENSION)access_rows);
+ }
+ coef->pub.consume_data = consume_data;
+ coef->pub.decompress_data = decompress_data;
+ coef->pub.coef_arrays = coef->whole_image; /* link to virtual arrays */
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ } else {
+ /* We only need a single-MCU buffer. */
+ JBLOCKROW buffer;
+ int i;
+
+ buffer = (JBLOCKROW)
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+ for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
+ coef->MCU_buffer[i] = buffer + i;
+ }
+ coef->pub.consume_data = dummy_consume_data;
+ coef->pub.decompress_data = decompress_onepass;
+ coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
+ }
+
+ /* Allocate the workspace buffer (room for the coefficients of one block) */
+ coef->workspace = (JCOEF *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(JCOEF) * DCTSIZE2);
+}
diff --git a/media/libjpeg/jdcoefct.h b/media/libjpeg/jdcoefct.h
new file mode 100644
index 0000000000..9a0e780663
--- /dev/null
+++ b/media/libjpeg/jdcoefct.h
@@ -0,0 +1,83 @@
+/*
+ * jdcoefct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+
+/* Block smoothing is only applicable for progressive JPEG, so: */
+#ifndef D_PROGRESSIVE_SUPPORTED
+#undef BLOCK_SMOOTHING_SUPPORTED
+#endif
+
+
+/* Private buffer controller object */
+
+/*
+ * State shared by the decompression coefficient-buffer controller routines.
+ * The public jpeg_d_coef_controller part comes first; the code in this file
+ * casts cinfo->coef to my_coef_ptr to reach the private fields behind it.
+ */
+typedef struct {
+ struct jpeg_d_coef_controller pub; /* public fields */
+
+ /* These variables keep track of the current location of the input side. */
+ /* cinfo->input_iMCU_row is also used for this. */
+ JDIMENSION MCU_ctr; /* counts MCUs processed in current row */
+ int MCU_vert_offset; /* counts MCU rows within iMCU row */
+ int MCU_rows_per_iMCU_row; /* number of such rows needed */
+
+ /* The output side's location is represented by cinfo->output_iMCU_row. */
+
+ /* In single-pass modes, it's sufficient to buffer just one MCU.
+ * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
+ * and let the entropy decoder write into that workspace each time.
+ * In multi-pass modes, this array points to the current MCU's blocks
+ * within the virtual arrays; it is used only by the input side.
+ */
+ JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+
+ /* Temporary coefficient workspace. The controller's init code allocates
+ * it with room for DCTSIZE2 JCOEF values.
+ */
+ JCOEF *workspace;
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+ /* In multi-pass modes, we need a virtual block array for each component. */
+ jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+#endif
+
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+ /* When doing block smoothing, we latch coefficient Al values here */
+ /* (the init code starts this out as NULL) */
+ int *coef_bits_latch;
+#define SAVED_COEFS 10 /* we save coef_bits[0..9] */
+#endif
+} my_coef_controller;
+
+/* Pointer type used when casting cinfo->coef to the private structure */
+typedef my_coef_controller *my_coef_ptr;
+
+
+LOCAL(void)
+start_iMCU_row(j_decompress_ptr cinfo)
+/* Prepare the input-side, within-iMCU-row counters for a fresh iMCU row */
+{
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  int rows_needed = 1;          /* interleaved scan: one MCU row per iMCU row */
+
+  /* A scan with a single component is noninterleaved: an iMCU row then spans
+   * v_samp_factor MCU rows, except for the last iMCU row of the image, where
+   * only last_row_height MCU rows remain to be processed.
+   */
+  if (cinfo->comps_in_scan <= 1) {
+    jpeg_component_info *compptr = cinfo->cur_comp_info[0];
+
+    rows_needed = (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows - 1)) ?
+                  compptr->v_samp_factor : compptr->last_row_height;
+  }
+
+  coef->MCU_rows_per_iMCU_row = rows_needed;
+  coef->MCU_vert_offset = 0;
+  coef->MCU_ctr = 0;
+}
diff --git a/media/libjpeg/jdcol565.c b/media/libjpeg/jdcol565.c
new file mode 100644
index 0000000000..53c7bd9187
--- /dev/null
+++ b/media/libjpeg/jdcol565.c
@@ -0,0 +1,384 @@
+/*
+ * jdcol565.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modifications:
+ * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains output colorspace conversion routines.
+ */
+
+/* This file is included by jdcolor.c */
+
+
+/*
+ * Convert num_rows rows of YCbCr samples, starting at row input_row of the
+ * three planes in input_buf, into packed 16-bit pixels written to the rows
+ * of output_buf.  One 16-bit value is stored per pixel, built by
+ * PACK_SHORT_565() from the range-limited R, G and B values.
+ *
+ * Each row is emitted as: one leading pixel if PACK_NEED_ALIGNMENT() says the
+ * row pointer requires it, then pairs of pixels stored with
+ * WRITE_TWO_ALIGNED_PIXELS(), then one trailing pixel if a column is left.
+ */
+
+INLINE
+LOCAL(void)
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  register int y, cb, cr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int r, g, b;
+    /* The column count must restart at output_width for every row.  The
+     * alignment branch below consumes one column; carrying that decrement
+     * over to later rows would leave their last pixel(s) unwritten.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    /* Leading single pixel, so that the paired stores start where
+     * PACK_NEED_ALIGNMENT() no longer objects.
+     */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                            SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, stored together */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                            SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_SHORT_565(r, g, b);
+
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                            SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      y = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                            SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
+
+
+/*
+ * Dithered variant of the YCbCr -> packed 16-bit conversion: identical row
+ * structure (optional leading pixel, pixel pairs, optional trailing pixel),
+ * but each channel goes through DITHER_565_R/G/B() with the dither value d0
+ * before range limiting.
+ *
+ * NOTE(review): d0 is looked up once per call from cinfo->output_scanline and
+ * then only rotated inside the pair loop; it is neither re-derived per row
+ * nor rotated after the leading alignment pixel.  That behavior is preserved
+ * here as-is -- confirm whether it is intended when num_rows > 1.
+ */
+
+INLINE
+LOCAL(void)
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  register int y, cb, cr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int r, g, b;
+    /* Restart the column count on every row; the alignment branch below
+     * consumes one column and that must not leak into the following rows.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    /* Leading single pixel when the row pointer needs it */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y +
+                                   ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                     SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, rotating d0 after each pixel */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y +
+                                   ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                     SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_SHORT_565(r, g, b);
+
+      y = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y +
+                                   ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                     SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      y = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y +
+                                   ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                     SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
+
+
+/*
+ * Pack num_rows rows of planar R, G, B samples (planes 0, 1, 2 of input_buf,
+ * starting at input_row) into 16-bit pixels in the rows of output_buf.
+ * No colorspace math is involved: samples are fed straight to
+ * PACK_SHORT_565().  Row structure: optional leading pixel, pixel pairs,
+ * optional trailing pixel.
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int r, g, b;
+    /* Restart the column count on every row; the alignment branch below
+     * consumes one column and that must not leak into the following rows.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    /* Leading single pixel when the row pointer needs it */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, stored together */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
+      rgb = PACK_SHORT_565(r, g, b);
+
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
+
+
+/*
+ * Dithered variant of the planar RGB -> packed 16-bit conversion.  Each
+ * sample goes through DITHER_565_R/G/B() with the dither value d0 and is then
+ * range-limited before packing.  Row structure: optional leading pixel,
+ * pixel pairs, optional trailing pixel.
+ *
+ * NOTE(review): d0 is looked up once per call from cinfo->output_scanline and
+ * only rotated inside the pair loop; that behavior is preserved as-is --
+ * confirm whether it is intended when num_rows > 1.
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int r, g, b;
+    /* Restart the column count on every row; the alignment branch below
+     * consumes one column and that must not leak into the following rows.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    /* Leading single pixel when the row pointer needs it */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, rotating d0 after each pixel */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_SHORT_565(r, g, b);
+
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
+      rgb = PACK_SHORT_565(r, g, b);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
+
+
+/*
+ * Expand num_rows rows of grayscale samples (plane 0 of input_buf, starting
+ * at input_row) into 16-bit pixels in the rows of output_buf, passing the
+ * gray level to PACK_SHORT_565() as all three channels.  Row structure:
+ * optional leading pixel, pixel pairs, optional trailing pixel.
+ */
+
+INLINE
+LOCAL(void)
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
+{
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int g;
+    /* Restart the column count on every row; the alignment branch below
+     * consumes one column and that must not leak into the following rows.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr = input_buf[0][input_row++];
+    outptr = *output_buf++;
+    /* Leading single pixel when the row pointer needs it */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      g = *inptr++;
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, stored together */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      g = *inptr++;
+      rgb = PACK_SHORT_565(g, g, g);
+      g = *inptr++;
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      g = *inptr;
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
+
+
+/*
+ * Dithered variant of the grayscale -> packed 16-bit conversion.  The gray
+ * level goes through DITHER_565_R() with the dither value d0, is
+ * range-limited, and is then passed to PACK_SHORT_565() as all three
+ * channels.  Row structure: optional leading pixel, pixel pairs, optional
+ * trailing pixel.
+ *
+ * NOTE(review): d0 is looked up once per call from cinfo->output_scanline and
+ * only rotated inside the pair loop; that behavior is preserved as-is --
+ * confirm whether it is intended when num_rows > 1.
+ */
+
+INLINE
+LOCAL(void)
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION input_row, JSAMPARRAY output_buf,
+                              int num_rows)
+{
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+
+  while (--num_rows >= 0) {
+    JLONG rgb;
+    unsigned int g;
+    /* Restart the column count on every row; the alignment branch below
+     * consumes one column and that must not leak into the following rows.
+     */
+    JDIMENSION num_cols = cinfo->output_width;
+
+    inptr = input_buf[0][input_row++];
+    outptr = *output_buf++;
+    /* Leading single pixel when the row pointer needs it */
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+      g = *inptr++;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16 *)outptr = (INT16)rgb;
+      outptr += 2;
+      num_cols--;
+    }
+    /* Main body: two pixels per iteration, rotating d0 after each pixel */
+    for (col = 0; col < (num_cols >> 1); col++) {
+      g = *inptr++;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_SHORT_565(g, g, g);
+      d0 = DITHER_ROTATE(d0);
+
+      g = *inptr++;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g));
+      d0 = DITHER_ROTATE(d0);
+
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    /* Trailing single pixel when an odd number of columns remains */
+    if (num_cols & 1) {
+      g = *inptr;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16 *)outptr = (INT16)rgb;
+    }
+  }
+}
diff --git a/media/libjpeg/jdcolext.c b/media/libjpeg/jdcolext.c
new file mode 100644
index 0000000000..fc7e7b8f00
--- /dev/null
+++ b/media/libjpeg/jdcolext.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolext.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2011, 2015, 2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains output colorspace conversion routines.
+ */
+
+
+/* This file is included by jdcolor.c */
+
+
+/*
+ * Convert some rows of samples to the output colorspace.
+ *
+ * Note that we change from noninterleaved, one-plane-per-component format
+ * to interleaved-pixel format. The output buffer is therefore three times
+ * as wide as the input buffer.
+ * A starting row offset is provided only for the input buffer. The caller
+ * can easily adjust the passed output_buf value to accommodate any row
+ * offset required on that side.
+ */
+
+INLINE
+LOCAL(void)
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  /* Hold the lookup tables in locals for the benefit of the inner loop */
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPROW yrow, cbrow, crrow;
+  register JSAMPROW dest;
+  register JDIMENSION x;
+  register int y, cb, cr;
+  JDIMENSION width = cinfo->output_width;
+  SHIFT_TEMPS
+
+  for (; num_rows > 0; num_rows--, input_row++) {
+    yrow = input_buf[0][input_row];
+    cbrow = input_buf[1][input_row];
+    crrow = input_buf[2][input_row];
+    dest = *output_buf++;
+
+    for (x = 0; x < width; x++, dest += RGB_PIXELSIZE) {
+      y = yrow[x];
+      cb = cbrow[x];
+      cr = crrow[x];
+      /* DCT losses add noise, so every channel must be range-limited. */
+      dest[RGB_RED] = range_limit[y + Crrtab[cr]];
+      dest[RGB_GREEN] = range_limit[y +
+                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                      SCALEBITS))];
+      dest[RGB_BLUE] = range_limit[y + Cbbtab[cb]];
+#ifdef RGB_ALPHA
+      /* Fill the otherwise unused byte with MAXJSAMPLE so that it reads as */
+      /* an opaque alpha value */
+      dest[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+    }
+  }
+}
+
+
+/*
+ * Convert grayscale to RGB: just duplicate the graylevel three times.
+ * This is provided to support applications that don't want to cope
+ * with grayscale as a separate case.
+ */
+
+INLINE
+LOCAL(void)
+gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION input_row, JSAMPARRAY output_buf,
+                          int num_rows)
+{
+  register JSAMPROW gray, dest;
+  register JDIMENSION x;
+  JDIMENSION width = cinfo->output_width;
+
+  for (; num_rows > 0; num_rows--) {
+    gray = input_buf[0][input_row++];
+    dest = *output_buf++;
+
+    for (x = 0; x < width; x++, dest += RGB_PIXELSIZE) {
+      /* The same gray level is written to all three color channels */
+      dest[RGB_BLUE] = gray[x];
+      dest[RGB_GREEN] = gray[x];
+      dest[RGB_RED] = gray[x];
+#ifdef RGB_ALPHA
+      /* Fill the otherwise unused byte with MAXJSAMPLE so that it reads as */
+      /* an opaque alpha value */
+      dest[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+    }
+  }
+}
+
+
+/*
+ * Convert RGB to extended RGB: just swap the order of source pixels
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  register JSAMPROW red, green, blue;
+  register JSAMPROW dest;
+  register JDIMENSION x;
+  JDIMENSION width = cinfo->output_width;
+
+  for (; num_rows > 0; num_rows--, input_row++) {
+    red = input_buf[0][input_row];
+    green = input_buf[1][input_row];
+    blue = input_buf[2][input_row];
+    dest = *output_buf++;
+
+    for (x = 0; x < width; x++, dest += RGB_PIXELSIZE) {
+      /* Planes 0, 1, 2 land at the byte positions chosen by RGB_* */
+      dest[RGB_RED] = red[x];
+      dest[RGB_GREEN] = green[x];
+      dest[RGB_BLUE] = blue[x];
+#ifdef RGB_ALPHA
+      /* Fill the otherwise unused byte with MAXJSAMPLE so that it reads as */
+      /* an opaque alpha value */
+      dest[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+    }
+  }
+}
diff --git a/media/libjpeg/jdcolor.c b/media/libjpeg/jdcolor.c
new file mode 100644
index 0000000000..735190b700
--- /dev/null
+++ b/media/libjpeg/jdcolor.c
@@ -0,0 +1,881 @@
+/*
+ * jdcolor.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, 2011-2012, 2014-2015, D. R. Commander.
+ * Copyright (C) 2013, Linaro Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains output colorspace conversion routines.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+
+
+/* Private subobject */
+
+/*
+ * Private state of the output color converter. The public
+ * jpeg_color_deconverter part comes first; the conversion routines cast
+ * cinfo->cconvert to my_cconvert_ptr to reach the lookup tables behind it.
+ * The table pointers are filled in by build_ycc_rgb_table() and
+ * build_rgb_y_table() respectively.
+ */
+typedef struct {
+ struct jpeg_color_deconverter pub; /* public fields */
+
+ /* Private state for YCC->RGB conversion */
+ /* (each table is allocated with MAXJSAMPLE + 1 entries) */
+ int *Cr_r_tab; /* => table for Cr to R conversion */
+ int *Cb_b_tab; /* => table for Cb to B conversion */
+ JLONG *Cr_g_tab; /* => table for Cr to G conversion */
+ JLONG *Cb_g_tab; /* => table for Cb to G conversion */
+
+ /* Private state for RGB->Y conversion */
+ /* (one table of TABLE_SIZE entries, split into R, G and B sections) */
+ JLONG *rgb_y_tab; /* => table for RGB to Y conversion */
+} my_color_deconverter;
+
+/* Pointer type used when casting cinfo->cconvert to the private structure */
+typedef my_color_deconverter *my_cconvert_ptr;
+
+
+/**************** YCbCr -> RGB conversion: most common case **************/
+/**************** RGB -> Y conversion: less common case **************/
+
+/*
+ * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
+ * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
+ * The conversion equations to be implemented are therefore
+ *
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
+ * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
+ *
+ * To avoid floating-point arithmetic, we represent the fractional constants
+ * as integers scaled up by 2^16 (about 4 digits precision); we have to divide
+ * the products by 2^16, with appropriate rounding, to get the correct answer.
+ * Notice that Y, being an integral input, does not contribute any fraction
+ * so it need not participate in the rounding.
+ *
+ * For even more speed, we avoid doing any multiplications in the inner loop
+ * by precalculating the constants times Cb and Cr for all possible values.
+ * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
+ * for 12-bit samples it is still acceptable. It's not very reasonable for
+ * 16-bit samples, but if you want lossless storage you shouldn't be changing
+ * colorspace anyway.
+ * The Cr=>R and Cb=>B values can be rounded to integers in advance; the
+ * values for the G calculation are left scaled up, since we must add them
+ * together before rounding.
+ */
+
+#define SCALEBITS 16 /* speediest right-shift on some machines */
+/* 0.5 in the scaled representation; added before shifting so as to round */
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+/* Convert a floating-point constant to a JLONG scaled up by 2^SCALEBITS, */
+/* rounded to the nearest integer */
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
+
+/* We allocate one big table for RGB->Y conversion and divide it up into
+ * three parts, instead of doing three alloc_small requests. This lets us
+ * use a single table base address, which can be held in a register in the
+ * inner loops on many machines (more than can hold all three addresses,
+ * anyway).
+ */
+
+#define R_Y_OFF 0 /* offset to R => Y section */
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* offset to B => Y section */
+#define TABLE_SIZE (3 * (MAXJSAMPLE + 1)) /* total entries, all 3 sections */
+
+
+/* Include inline routines for colorspace extensions */
+
+/* jdcolext.c is included once per output pixel layout. Before each
+ * inclusion, RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE select the byte
+ * positions and the pixel size, and the three *_convert_internal names are
+ * redefined so that every inclusion produces a distinctly named set of
+ * routines. RGB_ALPHA is defined only for the layouts with a fourth byte;
+ * when it is defined, jdcolext.c stores MAXJSAMPLE at that position.
+ * Every section #undef's what it defined, so the order of the directives
+ * below matters.
+ */
+
+/* Default layout: uses the RGB_* values already in effect at this point */
+/* (presumably from the library's configuration headers -- not shown here) */
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+/* Routines used for JCS_EXT_RGB */
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
+#define gray_rgb_convert_internal gray_extrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+/* Routines used for JCS_EXT_RGBX and JCS_EXT_RGBA (fourth byte at index 3) */
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
+#define gray_rgb_convert_internal gray_extrgbx_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+/* Routines used for JCS_EXT_BGR */
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
+#define gray_rgb_convert_internal gray_extbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+/* Routines used for JCS_EXT_BGRX and JCS_EXT_BGRA (fourth byte at index 3) */
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
+#define gray_rgb_convert_internal gray_extbgrx_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+/* Routines used for JCS_EXT_XBGR and JCS_EXT_ABGR (fourth byte at index 0) */
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
+#define gray_rgb_convert_internal gray_extxbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+/* Routines used for JCS_EXT_XRGB and JCS_EXT_ARGB (fourth byte at index 0) */
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
+#define gray_rgb_convert_internal gray_extxrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
+#include "jdcolext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef ycc_rgb_convert_internal
+#undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
+
+
+/*
+ * Initialize tables for YCC->RGB colorspace conversion.
+ */
+
+LOCAL(void)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  int *cr_r, *cb_b;
+  JLONG *cr_g, *cb_g;
+  JLONG chroma;
+  int sample;
+  SHIFT_TEMPS
+
+  /* One entry per possible sample value in each of the four tables */
+  cr_r = (int *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
+  cconvert->Cr_r_tab = cr_r;
+  cb_b = (int *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
+  cconvert->Cb_b_tab = cb_b;
+  cr_g = (JLONG *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+  cconvert->Cr_g_tab = cr_g;
+  cb_g = (JLONG *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+  cconvert->Cb_g_tab = cb_g;
+
+  for (sample = 0; sample <= MAXJSAMPLE; sample++) {
+    /* sample is the stored Cb or Cr value (0..MAXJSAMPLE); chroma is the */
+    /* same value with the CENTERJSAMPLE offset removed */
+    chroma = (JLONG)sample - CENTERJSAMPLE;
+
+    /* R contribution of Cr: 1.40200 * chroma, rounded to nearest int */
+    cr_r[sample] = (int)
+      RIGHT_SHIFT(FIX(1.40200) * chroma + ONE_HALF, SCALEBITS);
+    /* B contribution of Cb: 1.77200 * chroma, rounded to nearest int */
+    cb_b[sample] = (int)
+      RIGHT_SHIFT(FIX(1.77200) * chroma + ONE_HALF, SCALEBITS);
+    /* G contribution of Cr: -0.71414 * chroma, kept scaled up */
+    cr_g[sample] = (-FIX(0.71414)) * chroma;
+    /* G contribution of Cb: -0.34414 * chroma, kept scaled up.  ONE_HALF */
+    /* is folded in here so that the inner loops need not add it */
+    cb_g[sample] = (-FIX(0.34414)) * chroma + ONE_HALF;
+  }
+}
+
+
+/*
+ * Convert some rows of samples to the output colorspace.
+ */
+
+METHODDEF(void)
+ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Hand off to the routine compiled for the requested pixel layout.  The
+   * X and A flavors of a layout share one routine.
+   */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    return;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_BGR:
+    ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    return;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  default:
+    break;
+  }
+
+  /* Every other output colorspace gets the default-layout routine */
+  ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+/**************** Cases other than YCbCr -> RGB **************/
+
+
+/*
+ * Initialize for RGB->grayscale colorspace conversion.
+ */
+
+LOCAL(void)
+build_rgb_y_table(j_decompress_ptr cinfo)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  JLONG *table, *r_section, *g_section, *b_section;
+  JLONG sample;
+
+  /* A single allocation holds the R, G and B sections back to back */
+  table = (JLONG *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (TABLE_SIZE * sizeof(JLONG)));
+  cconvert->rgb_y_tab = table;
+
+  r_section = table + R_Y_OFF;
+  g_section = table + G_Y_OFF;
+  b_section = table + B_Y_OFF;
+
+  for (sample = 0; sample <= MAXJSAMPLE; sample++) {
+    r_section[sample] = FIX(0.29900) * sample;
+    g_section[sample] = FIX(0.58700) * sample;
+    /* The rounding term is folded into the B section only */
+    b_section[sample] = FIX(0.11400) * sample + ONE_HALF;
+  }
+}
+
+
+/*
+ * Convert RGB to grayscale.
+ */
+
+METHODDEF(void)
+rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  register JLONG *ytab = cconvert->rgb_y_tab;
+  register JSAMPROW red, green, blue;
+  register JSAMPROW gray;
+  register JDIMENSION x;
+  register JLONG luma;
+  JDIMENSION width = cinfo->output_width;
+
+  for (; num_rows > 0; num_rows--, input_row++) {
+    red = input_buf[0][input_row];
+    green = input_buf[1][input_row];
+    blue = input_buf[2][input_row];
+    gray = *output_buf++;
+
+    for (x = 0; x < width; x++) {
+      /* Sum the three scaled contributions, then drop the scale factor */
+      luma = ytab[red[x] + R_Y_OFF] + ytab[green[x] + G_Y_OFF] +
+             ytab[blue[x] + B_Y_OFF];
+      gray[x] = (JSAMPLE)(luma >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * Color conversion for no colorspace change: just copy the data,
+ * converting from separate-planes to interleaved representation.
+ */
+
+METHODDEF(void)
+null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW src, src0, src1, src2, src3, dest;
+  register JDIMENSION x;
+  register int ncomps = cinfo->num_components;
+  JDIMENSION width = cinfo->output_width;
+  int comp;
+
+  switch (ncomps) {
+  case 3:
+    /* Dedicated loop for three components */
+    for (; num_rows > 0; num_rows--, input_row++) {
+      src0 = input_buf[0][input_row];
+      src1 = input_buf[1][input_row];
+      src2 = input_buf[2][input_row];
+      dest = *output_buf++;
+      for (x = 0; x < width; x++, dest += 3) {
+        dest[0] = src0[x];
+        dest[1] = src1[x];
+        dest[2] = src2[x];
+      }
+    }
+    break;
+
+  case 4:
+    /* Dedicated loop for four components */
+    for (; num_rows > 0; num_rows--, input_row++) {
+      src0 = input_buf[0][input_row];
+      src1 = input_buf[1][input_row];
+      src2 = input_buf[2][input_row];
+      src3 = input_buf[3][input_row];
+      dest = *output_buf++;
+      for (x = 0; x < width; x++, dest += 4) {
+        dest[0] = src0[x];
+        dest[1] = src1[x];
+        dest[2] = src2[x];
+        dest[3] = src3[x];
+      }
+    }
+    break;
+
+  default:
+    /* Any other component count: make one pass over the output row per */
+    /* component, writing that component's byte of every pixel */
+    for (; num_rows > 0; num_rows--, input_row++, output_buf++) {
+      for (comp = 0; comp < ncomps; comp++) {
+        src = input_buf[comp][input_row];
+        dest = *output_buf;
+        for (x = 0; x < width; x++, dest += ncomps)
+          dest[comp] = src[x];
+      }
+    }
+    break;
+  }
+}
+
+
+/*
+ * Color conversion for grayscale: just copy the data.
+ * This also works for YCbCr -> grayscale conversion, in which
+ * we just copy the Y (luminance) component and ignore chrominance.
+ */
+
+METHODDEF(void)
+grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Only the first plane is read; any further planes are ignored */
+  JSAMPARRAY first_plane = input_buf[0];
+  JDIMENSION width = cinfo->output_width;
+
+  jcopy_sample_rows(first_plane, (int)input_row, output_buf, 0, num_rows,
+                    width);
+}
+
+
+/*
+ * Convert grayscale to RGB
+ */
+
+METHODDEF(void)
+gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Hand off to the routine compiled for the requested pixel layout.  The
+   * X and A flavors of a layout share one routine.
+   */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    return;
+  case JCS_EXT_BGR:
+    gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    return;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    return;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    return;
+  default:
+    break;
+  }
+
+  /* Every other output colorspace gets the default-layout routine */
+  gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                            num_rows);
+}
+
+
+/*
+ * Convert plain RGB to extended RGB
+ */
+
+METHODDEF(void)
+rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Hand off to the routine compiled for the requested pixel layout.  The
+   * X and A flavors of a layout share one routine.
+   */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    return;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_BGR:
+    rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    return;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    return;
+  default:
+    break;
+  }
+
+  /* Every other output colorspace gets the default-layout routine */
+  rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+/*
+ * Adobe-style YCCK->CMYK conversion.
+ * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same
+ * conversion as above, while passing K (black) unchanged.
+ * We assume build_ycc_rgb_table has been called.
+ */
+
+METHODDEF(void)
+ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
+  const JDIMENSION width = cinfo->output_width;
+  /* Keep the lookup tables in locals for the inner loop. */
+  JSAMPLE *limit = cinfo->sample_range_limit;
+  int *cr_to_r = cconvert->Cr_r_tab;
+  int *cb_to_b = cconvert->Cb_b_tab;
+  JLONG *cr_to_g = cconvert->Cr_g_tab;
+  JLONG *cb_to_g = cconvert->Cb_g_tab;
+  int row;
+  SHIFT_TEMPS
+
+  for (row = 0; row < num_rows; row++) {
+    /* One input row per component, four interleaved samples per output
+     * pixel.
+     */
+    JSAMPROW y_row = input_buf[0][input_row + (JDIMENSION)row];
+    JSAMPROW cb_row = input_buf[1][input_row + (JDIMENSION)row];
+    JSAMPROW cr_row = input_buf[2][input_row + (JDIMENSION)row];
+    JSAMPROW k_row = input_buf[3][input_row + (JDIMENSION)row];
+    JSAMPROW dst = output_buf[row];
+    JDIMENSION x;
+
+    for (x = 0; x < width; x++, dst += 4) {
+      int luma = y_row[x];
+      int cb = cb_row[x];
+      int cr = cr_row[x];
+      int red = luma + cr_to_r[cr];
+      int green = luma + ((int)RIGHT_SHIFT(cb_to_g[cb] + cr_to_g[cr],
+                                           SCALEBITS));
+      int blue = luma + cb_to_b[cb];
+
+      /* Range-limiting is essential due to noise introduced by DCT losses.
+       * The stored value is the complement (MAXJSAMPLE - value) of each
+       * computed R/G/B.
+       */
+      dst[0] = limit[MAXJSAMPLE - red];
+      dst[1] = limit[MAXJSAMPLE - green];
+      dst[2] = limit[MAXJSAMPLE - blue];
+      /* K passes through unchanged */
+      dst[3] = k_row[x];
+    }
+  }
+}
+
+
+/*
+ * RGB565 conversion
+ */
+
+/* Pack 8-bit r, g, b into one 16-bit RGB565 value.  The _LE form produces
+ * RRRRRGGG GGGBBBBB (high byte first); the _BE form produces the same two
+ * bytes in swapped positions.
+ */
+#define PACK_SHORT_565_LE(r, g, b) \
+  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+  (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
+
+/* Combine two packed pixels into one 32-bit word: _LE places l in the low
+ * half, _BE places l in the high half.  Every parameter is parenthesized so
+ * that arguments containing low-precedence operators (|, &, ?:) are grouped
+ * as the caller wrote them.
+ */
+#define PACK_TWO_PIXELS_LE(l, r)  (((r) << 16) | (l))
+#define PACK_TWO_PIXELS_BE(l, r)  (((l) << 16) | (r))
+
+/* Nonzero when ptr is not aligned to a 4-byte boundary. */
+#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
+
+/* Store a pixel pair through addr as a single int. */
+#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels) \
+  ((*(int *)(addr)) = (pixels))
+
+/* Add the low byte of the dither word to a sample; green receives half of
+ * it.
+ */
+#define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
+#define DITHER_565_G(g, dither)  ((g) + (((dither) & 0xFF) >> 1))
+#define DITHER_565_B(b, dither)  ((b) + ((dither) & 0xFF))
+
+
+/* Declarations for ordered dithering
+ *
+ * We use a 4x4 ordered dither array packed into 32 bits. This array is
+ * sufficient for dithering RGB888 to RGB565.
+ *
+ * Each dither_matrix word packs four one-byte entries; across the four
+ * words the sixteen entries are the values 0x00..0x0F, each appearing once.
+ * DITHER_ROTATE(x) yields the low 32 bits of x rotated right by eight bits
+ * (the low byte moves to the top).  DITHER_MASK is presumably the mask used
+ * to pick a matrix row in jdcol565.c -- not visible here, confirm there.
+ */
+
+#define DITHER_MASK 0x3
+#define DITHER_ROTATE(x) ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const JLONG dither_matrix[4] = {
+ 0x0008020A,
+ 0x0C040E06,
+ 0x030B0109,
+ 0x0F070D05
+};
+
+
+static INLINE boolean is_big_endian(void)
+{
+  /* Run-time byte-order probe: store the integer 1 and inspect the byte at
+   * its lowest address.  If that byte is not 1, report big-endian.
+   */
+  int probe = 1;
+  char *lowest_byte = (char *)&probe;
+
+  return (*lowest_byte == 1) ? FALSE : TRUE;
+}
+
+
+/* Include inline routines for RGB565 conversion
+ *
+ * jdcol565.c is included twice.  Before each inclusion the packing macros
+ * and the *_convert_internal names are redefined, so the first inclusion
+ * produces the *_le routines and the second the *_be routines.  All of the
+ * helper macros are #undef'd again after each inclusion.
+ */
+
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
+#define gray_rgb565_convert_internal gray_rgb565_convert_le
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
+#include "jdcol565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef ycc_rgb565_convert_internal
+#undef ycc_rgb565D_convert_internal
+#undef rgb_rgb565_convert_internal
+#undef rgb_rgb565D_convert_internal
+#undef gray_rgb565_convert_internal
+#undef gray_rgb565D_convert_internal
+
+#define PACK_SHORT_565 PACK_SHORT_565_BE /* second pass: *_be routines */
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
+#define gray_rgb565_convert_internal gray_rgb565_convert_be
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
+#include "jdcol565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef ycc_rgb565_convert_internal
+#undef ycc_rgb565D_convert_internal
+#undef rgb_rgb565_convert_internal
+#undef rgb_rgb565D_convert_internal
+#undef gray_rgb565_convert_internal
+#undef gray_rgb565D_convert_internal
+
+
+METHODDEF(void)
+ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* YCbCr -> RGB565: forward to the variant matching host byte order. */
+  if (!is_big_endian()) {
+    ycc_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+    return;
+  }
+  ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* YCbCr -> RGB565 (the "D" variant): forward to the routine matching host
+   * byte order.
+   */
+  if (!is_big_endian()) {
+    ycc_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+    return;
+  }
+  ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* RGB -> RGB565: forward to the variant matching host byte order. */
+  if (!is_big_endian()) {
+    rgb_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+    return;
+  }
+  rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* RGB -> RGB565 (the "D" variant): forward to the routine matching host
+   * byte order.
+   */
+  if (!is_big_endian()) {
+    rgb_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+    return;
+  }
+  rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Grayscale -> RGB565: forward to the variant matching host byte order. */
+  if (!is_big_endian()) {
+    gray_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows);
+    return;
+  }
+  gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+METHODDEF(void)
+gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+  /* Grayscale -> RGB565 (the "D" variant): forward to the routine matching
+   * host byte order.
+   */
+  if (!is_big_endian()) {
+    gray_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf,
+                            num_rows);
+    return;
+  }
+  gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
+}
+
+
+/*
+ * Empty method for start_pass.
+ *
+ * Installed as cconvert->pub.start_pass by jinit_color_deconverter.  The
+ * body does nothing and the cinfo argument is not used.
+ */
+
+METHODDEF(void)
+start_pass_dcolor(j_decompress_ptr cinfo)
+{
+ /* no work needed */
+}
+
+
+/*
+ * Module initialization routine for output colorspace conversion.
+ *
+ * Allocates the converter object from the JPOOL_IMAGE pool, installs it in
+ * cinfo->cconvert, and checks that num_components is consistent with
+ * jpeg_color_space.  It then sets out_color_components and selects the
+ * color_convert method for the requested out_color_space, calling
+ * build_ycc_rgb_table / build_rgb_y_table on the paths that use the C
+ * routines needing them.  Unsupported input/output combinations are
+ * reported with ERREXIT(JERR_CONVERSION_NOTIMPL).  Finally,
+ * output_components is set to 1 when quantize_colors is set and to
+ * out_color_components otherwise.
+ */
+
+GLOBAL(void)
+jinit_color_deconverter(j_decompress_ptr cinfo)
+{
+ my_cconvert_ptr cconvert;
+ int ci;
+
+ cconvert = (my_cconvert_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_color_deconverter));
+ cinfo->cconvert = (struct jpeg_color_deconverter *)cconvert;
+ cconvert->pub.start_pass = start_pass_dcolor;
+
+ /* Make sure num_components agrees with jpeg_color_space */
+ switch (cinfo->jpeg_color_space) {
+ case JCS_GRAYSCALE:
+ if (cinfo->num_components != 1)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ break;
+
+ case JCS_RGB:
+ case JCS_YCbCr:
+ if (cinfo->num_components != 3)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ break;
+
+ case JCS_CMYK:
+ case JCS_YCCK:
+ if (cinfo->num_components != 4)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ break;
+
+ default: /* JCS_UNKNOWN can be anything */
+ if (cinfo->num_components < 1)
+ ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+ break;
+ }
+
+ /* Set out_color_components and conversion method based on requested space.
+ * Also clear the component_needed flags for any unused components,
+ * so that earlier pipeline stages can avoid useless computation.
+ */
+
+ switch (cinfo->out_color_space) {
+ case JCS_GRAYSCALE:
+ cinfo->out_color_components = 1;
+ if (cinfo->jpeg_color_space == JCS_GRAYSCALE ||
+ cinfo->jpeg_color_space == JCS_YCbCr) {
+ cconvert->pub.color_convert = grayscale_convert;
+ /* For color->grayscale conversion, only the Y (0) component is needed */
+ for (ci = 1; ci < cinfo->num_components; ci++)
+ cinfo->comp_info[ci].component_needed = FALSE;
+ } else if (cinfo->jpeg_color_space == JCS_RGB) {
+ cconvert->pub.color_convert = rgb_gray_convert;
+ build_rgb_y_table(cinfo);
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_RGB:
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ case JCS_EXT_RGBA:
+ case JCS_EXT_BGRA:
+ case JCS_EXT_ABGR:
+ case JCS_EXT_ARGB:
+ cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
+ if (cinfo->jpeg_color_space == JCS_YCbCr) {
+ if (jsimd_can_ycc_rgb())
+ cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb_convert;
+ build_ycc_rgb_table(cinfo); /* tables built only on the non-SIMD path */
+ }
+ } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+ cconvert->pub.color_convert = gray_rgb_convert;
+ } else if (cinfo->jpeg_color_space == JCS_RGB) {
+ if (rgb_red[cinfo->out_color_space] == 0 &&
+ rgb_green[cinfo->out_color_space] == 1 &&
+ rgb_blue[cinfo->out_color_space] == 2 &&
+ rgb_pixelsize[cinfo->out_color_space] == 3)
+ cconvert->pub.color_convert = null_convert;
+ else
+ cconvert->pub.color_convert = rgb_rgb_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ case JCS_RGB565:
+ cinfo->out_color_components = 3;
+ if (cinfo->dither_mode == JDITHER_NONE) {
+ if (cinfo->jpeg_color_space == JCS_YCbCr) {
+ if (jsimd_can_ycc_rgb565())
+ cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb565_convert;
+ build_ycc_rgb_table(cinfo);
+ }
+ } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+ cconvert->pub.color_convert = gray_rgb565_convert;
+ } else if (cinfo->jpeg_color_space == JCS_RGB) {
+ cconvert->pub.color_convert = rgb_rgb565_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ } else {
+ /* only ordered dithering is supported */
+ if (cinfo->jpeg_color_space == JCS_YCbCr) {
+ cconvert->pub.color_convert = ycc_rgb565D_convert; /* no jsimd_can_*() query on this path */
+ build_ycc_rgb_table(cinfo);
+ } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+ cconvert->pub.color_convert = gray_rgb565D_convert;
+ } else if (cinfo->jpeg_color_space == JCS_RGB) {
+ cconvert->pub.color_convert = rgb_rgb565D_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ }
+ break;
+
+ case JCS_CMYK:
+ cinfo->out_color_components = 4;
+ if (cinfo->jpeg_color_space == JCS_YCCK) {
+ cconvert->pub.color_convert = ycck_cmyk_convert;
+ build_ycc_rgb_table(cinfo);
+ } else if (cinfo->jpeg_color_space == JCS_CMYK) {
+ cconvert->pub.color_convert = null_convert;
+ } else
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+
+ default:
+ /* Permit null conversion to same output space */
+ if (cinfo->out_color_space == cinfo->jpeg_color_space) {
+ cinfo->out_color_components = cinfo->num_components;
+ cconvert->pub.color_convert = null_convert;
+ } else /* unsupported non-null conversion */
+ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+ break;
+ }
+
+ if (cinfo->quantize_colors)
+ cinfo->output_components = 1; /* single colormapped output component */
+ else
+ cinfo->output_components = cinfo->out_color_components;
+}
diff --git a/media/libjpeg/jdct.h b/media/libjpeg/jdct.h
new file mode 100644
index 0000000000..66d1718b77
--- /dev/null
+++ b/media/libjpeg/jdct.h
@@ -0,0 +1,208 @@
+/*
+ * jdct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This include file contains common declarations for the forward and
+ * inverse DCT modules. These declarations are private to the DCT managers
+ * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms.
+ * The individual DCT algorithms are kept in separate files to ease
+ * machine-dependent tuning (e.g., assembly coding).
+ */
+
+
+/*
+ * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+ * the DCT is to be performed in-place in that buffer. Type DCTELEM is int
+ * (short when built WITH_SIMD) for 8-bit samples, JLONG for 12-bit samples.
+ * (NOTE: Floating-point DCT implementations use an array of type FAST_FLOAT,
+ * instead.)
+ * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * The DCT outputs are returned scaled up by a factor of 8; they therefore
+ * have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
+ * convention improves accuracy in integer implementations and saves some
+ * work in floating-point ones.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
+typedef int DCTELEM; /* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#else
+typedef short DCTELEM; /* prefer 16 bit with SIMD for parallelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+#endif
+#else
+typedef JLONG DCTELEM; /* must have 32 bits */
+typedef unsigned long long UDCTELEM2; /* note: no UDCTELEM is declared in this branch */
+#endif
+
+
+/*
+ * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer
+ * to an output sample array. The routine must dequantize the input data as
+ * well as perform the IDCT; for dequantization, it uses the multiplier table
+ * pointed to by compptr->dct_table. The output data is to be placed into the
+ * sample array starting at a specified column. (Any row offset needed will
+ * be applied to the array pointer before it is passed to the IDCT code.)
+ * Note that the number of samples emitted by the IDCT routine is
+ * DCT_scaled_size * DCT_scaled_size.
+ */
+
+/* typedef inverse_DCT_method_ptr is declared in jpegint.h */
+
+/*
+ * Each IDCT routine has its own ideas about the best dct_table element type.
+ *
+ * IFAST_SCALE_BITS is the number of fractional bits kept in the IFAST
+ * multipliers: 2 when BITS_IN_JSAMPLE is 8, 13 otherwise.
+ */
+
+typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+#if BITS_IN_JSAMPLE == 8
+typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
+#else
+typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */
+#endif
+typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+
+
+/*
+ * Each IDCT routine is responsible for range-limiting its results and
+ * converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
+ * be quite far out of range if the input data is corrupt, so a bulletproof
+ * range-limiting step is required. We use a mask-and-table-lookup method
+ * to do the combined operations quickly. See the comments with
+ * prepare_range_limit_table (in jdmaster.c) for more info.
+ *
+ * IDCT_range_limit(cinfo) is cinfo->sample_range_limit advanced by
+ * CENTERJSAMPLE entries.
+ */
+
+#define IDCT_range_limit(cinfo) ((cinfo)->sample_range_limit + CENTERJSAMPLE)
+
+#define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
+
+
+/* Extern declarations for the forward and inverse DCT routines.
+ *
+ * The jpeg_fdct_* routines take only a pointer to the work area (DCTELEM
+ * for the integer variants, FAST_FLOAT for the float variant).  All
+ * jpeg_idct_* routines share a single signature.  jddctmgr.c chooses among
+ * them per component: jpeg_idct_NxN when the component's scaled DCT size is
+ * N, and islow/ifast/float according to cinfo->dct_method when the size is
+ * DCTSIZE.
+ */
+
+EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
+
+EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+
+
+/*
+ * Macros for handling fixed-point arithmetic; these are used by many
+ * but not all of the DCT/IDCT modules.
+ *
+ * All values are expected to be of type JLONG.
+ * Fractional constants are scaled left by CONST_BITS bits.
+ * CONST_BITS is defined within each module using these macros,
+ * and may differ from one module to the next.
+ */
+
+#define ONE ((JLONG)1)
+#define CONST_SCALE (ONE << CONST_BITS)
+
+/* Convert a positive real constant to an integer scaled by CONST_SCALE.
+ * Caution: some C compilers fail to reduce "FIX(constant)" at compile time,
+ * thus causing a lot of useless floating-point operations at run time.
+ * The + 0.5 rounds to nearest before the truncating cast to JLONG.
+ */
+
+#define FIX(x) ((JLONG)((x) * CONST_SCALE + 0.5))
+
+/* Descale and correctly round a JLONG value that's scaled by N bits.
+ * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
+ * the fudge factor is correct for either sign of X.
+ * N must be at least 1, because the fudge factor is ONE << (N - 1).
+ */
+
+#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)
+
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
+ * This macro is used only when the two inputs will actually be no more than
+ * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a
+ * full 32x32 multiply. This provides a useful speedup on many machines.
+ * Unfortunately there is no way to specify a 16x16->32 multiply portably
+ * in C, but some C compilers will do the right thing if you provide the
+ * correct combination of casts.
+ */
+
+/* MULTIPLY16C16(var, cnst): multiply a variable by a constant.
+ * The second parameter is spelled "cnst" rather than "const" so that the
+ * macro definition does not reuse a C keyword as an identifier.  The first
+ * applicable definition below wins; the last is the portable default.
+ */
+#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
+#define MULTIPLY16C16(var, cnst)  (((INT16)(var)) * ((INT16)(cnst)))
+#endif
+#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
+#define MULTIPLY16C16(var, cnst)  (((INT16)(var)) * ((JLONG)(cnst)))
+#endif
+
+#ifndef MULTIPLY16C16           /* default definition */
+#define MULTIPLY16C16(var, cnst)  ((var) * (cnst))
+#endif
+
+/* Variable-by-variable counterpart of MULTIPLY16C16: both operands are
+ * run-time values.
+ */
+
+#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
+#define MULTIPLY16V16(lhs, rhs)  (((INT16)(lhs)) * ((INT16)(rhs)))
+#endif
+
+#ifndef MULTIPLY16V16           /* default definition */
+#define MULTIPLY16V16(lhs, rhs)  ((lhs) * (rhs))
+#endif
diff --git a/media/libjpeg/jddctmgr.c b/media/libjpeg/jddctmgr.c
new file mode 100644
index 0000000000..e78d7bebe2
--- /dev/null
+++ b/media/libjpeg/jddctmgr.c
@@ -0,0 +1,352 @@
+/*
+ * jddctmgr.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2002-2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, 2015, 2022, D. R. Commander.
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the inverse-DCT management logic.
+ * This code selects a particular IDCT implementation to be used,
+ * and it performs related housekeeping chores. No code in this file
+ * is executed per IDCT step, only during output pass setup.
+ *
+ * Note that the IDCT routines are responsible for performing coefficient
+ * dequantization as well as the IDCT proper. This module sets up the
+ * dequantization multiplier table needed by the IDCT routine.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+#include "jsimddct.h"
+#include "jpegcomp.h"
+
+
+/*
+ * The decompressor input side (jdinput.c) saves away the appropriate
+ * quantization table for each component at the start of the first scan
+ * involving that component. (This is necessary in order to correctly
+ * decode files that reuse Q-table slots.)
+ * When we are ready to make an output pass, the saved Q-table is converted
+ * to a multiplier table that will actually be used by the IDCT routine.
+ * The multiplier table contents are IDCT-method-dependent. To support
+ * application changes in IDCT method between scans, we can remake the
+ * multiplier tables if necessary.
+ * In buffered-image mode, the first output pass may occur before any data
+ * has been seen for some components, and thus before their Q-tables have
+ * been saved away. To handle this case, multiplier tables are preset
+ * to zeroes; the result of the IDCT will be a neutral gray level.
+ */
+
+
+/* Private subobject for this module */
+
+typedef struct {
+ struct jpeg_inverse_dct pub; /* public fields */
+
+ /* This array contains the IDCT method code that each multiplier table
+ * is currently set up for, or -1 if it's not yet set up.
+ * The actual multiplier tables are pointed to by dct_table in the
+ * per-component comp_info structures.
+ * jinit_inverse_dct sets every entry to -1; start_pass stores the method
+ * code when it rebuilds a component's table.
+ */
+ int cur_method[MAX_COMPONENTS];
+} my_idct_controller;
+
+typedef my_idct_controller *my_idct_ptr;
+
+
+/* Allocated multiplier tables: big enough for any supported variant.
+ * jinit_inverse_dct allocates one of these per component and zero-fills it;
+ * the ifast and float members exist only when the corresponding
+ * DCT_*_SUPPORTED macro is defined.
+ */
+
+typedef union {
+ ISLOW_MULT_TYPE islow_array[DCTSIZE2];
+#ifdef DCT_IFAST_SUPPORTED
+ IFAST_MULT_TYPE ifast_array[DCTSIZE2];
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ FLOAT_MULT_TYPE float_array[DCTSIZE2];
+#endif
+} multiplier_table;
+
+
+/* The scaled-IDCT routines consume ISLOW-style multiplier tables, so the
+ * ISLOW table-building code must be compiled whenever either ISLOW or IDCT
+ * scaling support is enabled.
+ */
+#if defined(DCT_ISLOW_SUPPORTED) || defined(IDCT_SCALING_SUPPORTED)
+#define PROVIDE_ISLOW_TABLES
+#endif
+
+
+/*
+ * Prepare for an output pass.
+ * Here we select the proper IDCT routine for each component and build
+ * a matching multiplier table.
+ *
+ * The routine is chosen from the component's _DCT_scaled_size.  Sizes other
+ * than DCTSIZE are handled only when IDCT_SCALING_SUPPORTED is defined and
+ * always use ISLOW-style tables; size DCTSIZE follows cinfo->dct_method.
+ * Where a jsimd_can_idct_*() query exists and succeeds, the jsimd_ routine
+ * is installed instead of the jpeg_ one.  Any other size raises
+ * JERR_BAD_DCTSIZE, and a method that was not compiled in raises
+ * JERR_NOT_COMPILED.
+ *
+ * The multiplier table is rebuilt only when the component is needed, a
+ * quantization table has been saved for it, and cur_method[ci] differs
+ * from the method just selected.
+ */
+
+METHODDEF(void)
+start_pass(j_decompress_ptr cinfo)
+{
+ my_idct_ptr idct = (my_idct_ptr)cinfo->idct;
+ int ci, i;
+ jpeg_component_info *compptr;
+ int method = 0;
+ inverse_DCT_method_ptr method_ptr = NULL;
+ JQUANT_TBL *qtbl;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Select the proper IDCT routine for this component's scaling */
+ switch (compptr->_DCT_scaled_size) {
+#ifdef IDCT_SCALING_SUPPORTED
+ case 1:
+ method_ptr = jpeg_idct_1x1;
+ method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ break;
+ case 2:
+ if (jsimd_can_idct_2x2())
+ method_ptr = jsimd_idct_2x2;
+ else
+ method_ptr = jpeg_idct_2x2;
+ method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ break;
+ case 3:
+ method_ptr = jpeg_idct_3x3;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 4:
+ if (jsimd_can_idct_4x4())
+ method_ptr = jsimd_idct_4x4;
+ else
+ method_ptr = jpeg_idct_4x4;
+ method = JDCT_ISLOW; /* jidctred uses islow-style table */
+ break;
+ case 5:
+ method_ptr = jpeg_idct_5x5;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 6:
+#if defined(__mips__)
+ if (jsimd_can_idct_6x6())
+ method_ptr = jsimd_idct_6x6;
+ else
+#endif
+ method_ptr = jpeg_idct_6x6;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 7:
+ method_ptr = jpeg_idct_7x7;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+#endif
+ case DCTSIZE:
+ switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+ case JDCT_ISLOW:
+ if (jsimd_can_idct_islow())
+ method_ptr = jsimd_idct_islow;
+ else
+ method_ptr = jpeg_idct_islow;
+ method = JDCT_ISLOW;
+ break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+ case JDCT_IFAST:
+ if (jsimd_can_idct_ifast())
+ method_ptr = jsimd_idct_ifast;
+ else
+ method_ptr = jpeg_idct_ifast;
+ method = JDCT_IFAST;
+ break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ case JDCT_FLOAT:
+ if (jsimd_can_idct_float())
+ method_ptr = jsimd_idct_float;
+ else
+ method_ptr = jpeg_idct_float;
+ method = JDCT_FLOAT;
+ break;
+#endif
+ default:
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ break;
+ }
+ break;
+#ifdef IDCT_SCALING_SUPPORTED
+ case 9:
+ method_ptr = jpeg_idct_9x9;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 10:
+ method_ptr = jpeg_idct_10x10;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 11:
+ method_ptr = jpeg_idct_11x11;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 12:
+#if defined(__mips__)
+ if (jsimd_can_idct_12x12())
+ method_ptr = jsimd_idct_12x12;
+ else
+#endif
+ method_ptr = jpeg_idct_12x12;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 13:
+ method_ptr = jpeg_idct_13x13;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 14:
+ method_ptr = jpeg_idct_14x14;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 15:
+ method_ptr = jpeg_idct_15x15;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+ case 16:
+ method_ptr = jpeg_idct_16x16;
+ method = JDCT_ISLOW; /* jidctint uses islow-style table */
+ break;
+#endif
+ default:
+ ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size);
+ break;
+ }
+ idct->pub.inverse_DCT[ci] = method_ptr;
+ /* Create multiplier table from quant table.
+ * However, we can skip this if the component is uninteresting
+ * or if we already built the table. Also, if no quant table
+ * has yet been saved for the component, we leave the
+ * multiplier table all-zero; we'll be reading zeroes from the
+ * coefficient controller's buffer anyway.
+ */
+ if (!compptr->component_needed || idct->cur_method[ci] == method)
+ continue;
+ qtbl = compptr->quant_table;
+ if (qtbl == NULL) /* happens if no data yet for component */
+ continue;
+ idct->cur_method[ci] = method;
+ switch (method) {
+#ifdef PROVIDE_ISLOW_TABLES
+ case JDCT_ISLOW:
+ {
+ /* For LL&M IDCT method, multipliers are equal to raw quantization
+ * coefficients, but are stored as ints to ensure access efficiency.
+ */
+ ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ for (i = 0; i < DCTSIZE2; i++) {
+ ismtbl[i] = (ISLOW_MULT_TYPE)qtbl->quantval[i];
+ }
+ }
+ break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+ case JDCT_IFAST:
+ {
+ /* For AA&N IDCT method, multipliers are equal to quantization
+ * coefficients scaled by scalefactor[row]*scalefactor[col], where
+ * scalefactor[0] = 1
+ * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
+ * For integer operation, the multiplier table is to be scaled by
+ * IFAST_SCALE_BITS.
+ */
+ IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *)compptr->dct_table;
+#define CONST_BITS 14 /* matches the 14-bit scaling of aanscales[] below */
+ static const INT16 aanscales[DCTSIZE2] = {
+ /* precomputed values scaled up by 14 bits */
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
+ 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
+ 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
+ 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
+ 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
+ };
+ SHIFT_TEMPS
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ ifmtbl[i] = (IFAST_MULT_TYPE)
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - IFAST_SCALE_BITS);
+ }
+ }
+ break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ case JDCT_FLOAT:
+ {
+ /* For float AA&N IDCT method, multipliers are equal to quantization
+ * coefficients scaled by scalefactor[row]*scalefactor[col], where
+ * scalefactor[0] = 1
+ * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
+ */
+ FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *)compptr->dct_table;
+ int row, col;
+ static const double aanscalefactor[DCTSIZE] = {
+ 1.0, 1.387039845, 1.306562965, 1.175875602,
+ 1.0, 0.785694958, 0.541196100, 0.275899379
+ };
+
+ i = 0;
+ for (row = 0; row < DCTSIZE; row++) {
+ for (col = 0; col < DCTSIZE; col++) {
+ fmtbl[i] = (FLOAT_MULT_TYPE)
+ ((double)qtbl->quantval[i] *
+ aanscalefactor[row] * aanscalefactor[col]);
+ i++;
+ }
+ }
+ }
+ break;
+#endif
+ default:
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ break;
+ }
+ }
+}
+
+
+/*
+ * Initialize IDCT manager.
+ */
+
+GLOBAL(void)
+jinit_inverse_dct(j_decompress_ptr cinfo)
+{
+  my_idct_ptr controller;
+  int comp;
+
+  /* Create the controller in the JPOOL_IMAGE pool and publish it. */
+  controller = (my_idct_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_idct_controller));
+  cinfo->idct = (struct jpeg_inverse_dct *)controller;
+  controller->pub.start_pass = start_pass;
+
+  for (comp = 0; comp < cinfo->num_components; comp++) {
+    jpeg_component_info *info = &cinfo->comp_info[comp];
+    void *table;
+
+    /* Give every component a zero-filled multiplier table ... */
+    table = (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                        sizeof(multiplier_table));
+    memset(table, 0, sizeof(multiplier_table));
+    info->dct_table = table;
+    /* ... and record that it is not yet set up for any IDCT method. */
+    controller->cur_method[comp] = -1;
+  }
+}
diff --git a/media/libjpeg/jdhuff.c b/media/libjpeg/jdhuff.c
new file mode 100644
index 0000000000..679d221685
--- /dev/null
+++ b/media/libjpeg/jdhuff.c
@@ -0,0 +1,834 @@
+/*
+ * jdhuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy decoding routines.
+ *
+ * Much of the complexity here has to do with supporting input suspension.
+ * If the data source module demands suspension, we want to be able to back
+ * up to the start of the current MCU. To do this, we copy state variables
+ * into local working storage, and update them back to the permanent
+ * storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdhuff.h" /* Declarations shared with jdphuff.c */
+#include "jpegcomp.h"
+#include "jstdhuff.c"
+
+
+/*
+ * Expanded entropy decoder object for Huffman decoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+/* State that decode_mcu_slow()/decode_mcu_fast() copy into a local at the
+ * start of an MCU and write back to the decoder object only once the whole
+ * MCU has been decoded.
+ */
+typedef struct {
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+/* Private decoder object; cinfo->entropy points at its "pub" member, which
+ * is why the code in this file casts cinfo->entropy to huff_entropy_ptr.
+ */
+typedef struct {
+ struct jpeg_entropy_decoder pub; /* public fields */
+
+ /* These fields are loaded into local variables at start of each MCU.
+ * In case of suspension, we exit WITHOUT updating them.
+ */
+ bitread_perm_state bitstate; /* Bit buffer at start of MCU */
+ savable_state saved; /* Other state at start of MCU */
+
+ /* These fields are NOT loaded into local working state. */
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+
+ /* Pointers to derived tables (these workspaces have image lifespan).
+ * Entries start out NULL and are allocated on first use by
+ * jpeg_make_d_derived_tbl().
+ */
+ d_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+ d_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
+
+ /* Precalculated info set up by start_pass for use in decode_mcu: */
+
+ /* Pointers to derived tables to be used for each block within an MCU */
+ d_derived_tbl *dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+ d_derived_tbl *ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+ /* Whether we care about the DC and AC coefficient values for each block */
+ boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
+ boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
+} huff_entropy_decoder;
+
+typedef huff_entropy_decoder *huff_entropy_ptr;
+
+
+/*
+ * Initialize for a Huffman-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass_huff_decoder(j_decompress_ptr cinfo)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  int ci, blkn;
+
+  /* Sequential JPEG expects Ss = 0, Se = 63, Ah = Al = 0.  Anything else is
+   * only warned about, not rejected, because some baseline files in the wild
+   * carry all zeroes in these bytes.
+   */
+  if (!(cinfo->Ss == 0 && cinfo->Se == DCTSIZE2 - 1 &&
+        cinfo->Ah == 0 && cinfo->Al == 0))
+    WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+
+  /* Per scan component: (re)build its DC and AC derived tables and reset the
+   * DC predictor.  A table shared by several components is simply rebuilt
+   * more than once, which is cheap.
+   */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    jpeg_component_info *comp = cinfo->cur_comp_info[ci];
+    int dc_index = comp->dc_tbl_no;
+    int ac_index = comp->ac_tbl_no;
+
+    jpeg_make_d_derived_tbl(cinfo, TRUE, dc_index,
+                            entropy->dc_derived_tbls + dc_index);
+    jpeg_make_d_derived_tbl(cinfo, FALSE, ac_index,
+                            entropy->ac_derived_tbls + ac_index);
+    entropy->saved.last_dc_val[ci] = 0;
+  }
+
+  /* Per block of an MCU: cache the tables to decode with and whether the
+   * decoded values are wanted at all.
+   */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    jpeg_component_info *comp =
+      cinfo->cur_comp_info[cinfo->MCU_membership[blkn]];
+
+    entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[comp->dc_tbl_no];
+    entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[comp->ac_tbl_no];
+
+    if (!comp->component_needed) {
+      entropy->dc_needed[blkn] = FALSE;
+      entropy->ac_needed[blkn] = FALSE;
+    } else {
+      entropy->dc_needed[blkn] = TRUE;
+      /* a 1/8th-size image is built from the DC terms alone */
+      entropy->ac_needed[blkn] = (comp->_DCT_scaled_size > 1);
+    }
+  }
+
+  /* Start the scan with an empty bit buffer and no data shortage recorded */
+  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+  entropy->bitstate.bits_left = 0;
+  entropy->pub.insufficient_data = FALSE;
+
+  /* Restart counter begins at a full interval */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Compute the derived values for a Huffman table.
+ * This routine also performs some validation checks on the table.
+ *
+ * Note this is also used by jdphuff.c.
+ *
+ * Parameters:
+ *   cinfo  decompressor whose dc_huff_tbl_ptrs[] / ac_huff_tbl_ptrs[] hold
+ *          the source table
+ *   isDC   TRUE selects the DC table set, FALSE the AC table set
+ *   tblno  table index; JERR_NO_HUFF_TABLE is raised if it is outside
+ *          0..NUM_HUFF_TBLS-1 or the selected table pointer is NULL
+ *   pdtbl  in/out: if *pdtbl is NULL a d_derived_tbl is allocated from
+ *          JPOOL_IMAGE and stored there; otherwise the existing workspace
+ *          is overwritten
+ *
+ * JERR_BAD_HUFF_TABLE is raised if the code-length counts describe more than
+ * 256 symbols, do not form a legal code tree, or (DC tables only) contain a
+ * symbol greater than 15.
+ */
+
+GLOBAL(void)
+jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
+ d_derived_tbl **pdtbl)
+{
+ JHUFF_TBL *htbl;
+ d_derived_tbl *dtbl;
+ int p, i, l, si, numsymbols;
+ int lookbits, ctr;
+ /* 257 entries: up to 256 symbols plus the zero terminator stored below */
+ char huffsize[257];
+ unsigned int huffcode[257];
+ unsigned int code;
+
+ /* Note that huffsize[] and huffcode[] are filled in code-length order,
+ * paralleling the order of the symbols themselves in htbl->huffval[].
+ */
+
+ /* Find the input Huffman table */
+ if (tblno < 0 || tblno >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);
+ htbl =
+ isDC ? cinfo->dc_huff_tbl_ptrs[tblno] : cinfo->ac_huff_tbl_ptrs[tblno];
+ if (htbl == NULL)
+ ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno);
+
+ /* Allocate a workspace if we haven't already done so. */
+ if (*pdtbl == NULL)
+ *pdtbl = (d_derived_tbl *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(d_derived_tbl));
+ dtbl = *pdtbl;
+ dtbl->pub = htbl; /* fill in back link */
+
+ /* Figure C.1: make table of Huffman code length for each symbol */
+
+ p = 0;
+ for (l = 1; l <= 16; l++) {
+ i = (int)htbl->bits[l];
+ if (i < 0 || p + i > 256) /* protect against table overrun */
+ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+ while (i--)
+ huffsize[p++] = (char)l;
+ }
+ huffsize[p] = 0; /* terminator: a length of 0 ends the loop below */
+ numsymbols = p;
+
+ /* Figure C.2: generate the codes themselves */
+ /* We also validate that the counts represent a legal Huffman code tree. */
+
+ code = 0;
+ si = huffsize[0];
+ p = 0;
+ while (huffsize[p]) {
+ while (((int)huffsize[p]) == si) {
+ huffcode[p++] = code;
+ code++;
+ }
+ /* code is now 1 more than the last code used for codelength si; but
+ * it must still fit in si bits, since no code is allowed to be all ones.
+ */
+ if (((JLONG)code) >= (((JLONG)1) << si))
+ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+ code <<= 1;
+ si++;
+ }
+
+ /* Figure F.15: generate decoding tables for bit-sequential decoding */
+
+ p = 0;
+ for (l = 1; l <= 16; l++) {
+ if (htbl->bits[l]) {
+ /* valoffset[l] = huffval[] index of 1st symbol of code length l,
+ * minus the minimum code of length l
+ */
+ dtbl->valoffset[l] = (JLONG)p - (JLONG)huffcode[p];
+ p += htbl->bits[l];
+ dtbl->maxcode[l] = huffcode[p - 1]; /* maximum code of length l */
+ } else {
+ dtbl->maxcode[l] = -1; /* -1 if no codes of this length */
+ }
+ }
+ dtbl->valoffset[17] = 0;
+ dtbl->maxcode[17] = 0xFFFFFL; /* ensures jpeg_huff_decode terminates */
+
+ /* Compute lookahead tables to speed up decoding.
+ * First we set all the table entries to (HUFF_LOOKAHEAD + 1) <<
+ * HUFF_LOOKAHEAD, i.e. a code length of HUFF_LOOKAHEAD + 1 with symbol 0,
+ * indicating "too long";
+ * then we iterate through the Huffman codes that are short enough and
+ * fill in all the entries that correspond to bit sequences starting
+ * with that code.
+ */
+
+ for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+ dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
+
+ p = 0;
+ for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
+ for (i = 1; i <= (int)htbl->bits[l]; i++, p++) {
+ /* l = current code's length, p = its index in huffcode[] & huffval[]. */
+ /* Generate left-justified code followed by all possible bit sequences */
+ lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+ for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--) {
+ /* entry layout: code length above bit HUFF_LOOKAHEAD, symbol below */
+ dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
+ lookbits++;
+ }
+ }
+ }
+
+ /* Validate symbols as being reasonable.
+ * For AC tables, we make no check, but accept all byte values 0..255.
+ * For DC tables, we require the symbols to be in range 0..15.
+ * (Tighter bounds could be applied depending on the data depth and mode,
+ * but this is sufficient to ensure safe decoding.)
+ */
+ if (isDC) {
+ for (i = 0; i < numsymbols; i++) {
+ int sym = htbl->huffval[i];
+ if (sym < 0 || sym > 15)
+ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+ }
+ }
+}
+
+
+/*
+ * Out-of-line code for bit fetching (shared with jdphuff.c).
+ * See jdhuff.h for info about usage.
+ * Note: current values of get_buffer and bits_left are passed as parameters,
+ * but are returned in the corresponding fields of the state struct.
+ *
+ * On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
+ * of get_buffer to be used. (On machines with wider words, an even larger
+ * buffer could be used.) However, on some machines 32-bit shifts are
+ * quite slow and take time proportional to the number of places shifted.
+ * (This is true with most PC compilers, for instance.) In this case it may
+ * be a win to set MIN_GET_BITS to the minimum value of 15. This reduces the
+ * average shift distance at the cost of more calls to jpeg_fill_bit_buffer.
+ */
+
+#ifdef SLOW_SHIFT_32
+#define MIN_GET_BITS 15 /* minimum allowable value */
+#else
+/* The fill loop adds 8 bits while bits_left < MIN_GET_BITS, so with this
+ * value bits_left never exceeds BIT_BUF_SIZE.
+ */
+#define MIN_GET_BITS (BIT_BUF_SIZE - 7)
+#endif
+
+
+GLOBAL(boolean)
+jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ int nbits)
+/* Load up the bit buffer to a depth of at least nbits.
+ *
+ * state       working state; its next_input_byte, bytes_in_buffer,
+ *             get_buffer and bits_left are updated on a TRUE return only
+ * get_buffer  caller's current bit buffer contents
+ * bits_left   number of valid bits in get_buffer
+ * nbits       number of bits the caller needs
+ *
+ * Returns FALSE if the source's fill_input_buffer method returned FALSE
+ * (suspension); *state is then left untouched.  Returns TRUE otherwise; if a
+ * marker cut the data short, zero bits are appended and JWRN_HIT_MARKER is
+ * issued once per data segment.
+ */
+{
+ /* Copy heavily used state fields into locals (hopefully registers) */
+ register const JOCTET *next_input_byte = state->next_input_byte;
+ register size_t bytes_in_buffer = state->bytes_in_buffer;
+ j_decompress_ptr cinfo = state->cinfo;
+
+ /* Attempt to load at least MIN_GET_BITS bits into get_buffer. */
+ /* (It is assumed that no request will be for more than that many bits.) */
+ /* We fail to do so only if we hit a marker or are forced to suspend. */
+
+ if (cinfo->unread_marker == 0) { /* cannot advance past a marker */
+ while (bits_left < MIN_GET_BITS) {
+ register int c;
+
+ /* Attempt to read a byte */
+ if (bytes_in_buffer == 0) {
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
+ return FALSE;
+ next_input_byte = cinfo->src->next_input_byte;
+ bytes_in_buffer = cinfo->src->bytes_in_buffer;
+ }
+ bytes_in_buffer--;
+ c = *next_input_byte++;
+
+ /* If it's 0xFF, check and discard stuffed zero byte */
+ if (c == 0xFF) {
+ /* Loop here to discard any padding FF's on terminating marker,
+ * so that we can save a valid unread_marker value. NOTE: we will
+ * accept multiple FF's followed by a 0 as meaning a single FF data
+ * byte. This data pattern is not valid according to the standard.
+ */
+ do {
+ if (bytes_in_buffer == 0) {
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
+ return FALSE;
+ next_input_byte = cinfo->src->next_input_byte;
+ bytes_in_buffer = cinfo->src->bytes_in_buffer;
+ }
+ bytes_in_buffer--;
+ c = *next_input_byte++;
+ } while (c == 0xFF);
+
+ if (c == 0) {
+ /* Found FF/00, which represents an FF data byte */
+ c = 0xFF;
+ } else {
+ /* Oops, it's actually a marker indicating end of compressed data.
+ * Save the marker code for later use.
+ * Fine point: it might appear that we should save the marker into
+ * bitread working state, not straight into permanent state. But
+ * once we have hit a marker, we cannot need to suspend within the
+ * current MCU, because we will read no more bytes from the data
+ * source. So it is OK to update permanent state right away.
+ */
+ cinfo->unread_marker = c;
+ /* See if we need to insert some fake zero bits. */
+ goto no_more_bytes;
+ }
+ }
+
+ /* OK, load c into get_buffer */
+ get_buffer = (get_buffer << 8) | c;
+ bits_left += 8;
+ } /* end while */
+ } else {
+no_more_bytes:
+ /* We get here if we've read the marker that terminates the compressed
+ * data segment. There should be enough bits in the buffer register
+ * to satisfy the request; if so, no problem.
+ */
+ if (nbits > bits_left) {
+ /* Uh-oh. Report corrupted data to user and stuff zeroes into
+ * the data stream, so that we can produce some kind of image.
+ * We use a nonvolatile flag to ensure that only one warning message
+ * appears per data segment.
+ */
+ if (!cinfo->entropy->insufficient_data) {
+ WARNMS(cinfo, JWRN_HIT_MARKER);
+ cinfo->entropy->insufficient_data = TRUE;
+ }
+ /* Fill the buffer with zero bits */
+ get_buffer <<= MIN_GET_BITS - bits_left;
+ bits_left = MIN_GET_BITS;
+ }
+ }
+
+ /* Unload the local registers */
+ state->next_input_byte = next_input_byte;
+ state->bytes_in_buffer = bytes_in_buffer;
+ state->get_buffer = get_buffer;
+ state->bits_left = bits_left;
+
+ return TRUE;
+}
+
+
+/* Macro version of the above, which performs much better but does not
+ handle markers. We have to hand off any blocks with markers to the
+ slower routines.
+
+ GET_BYTE expects locals named buffer, get_buffer, bits_left and cinfo in
+ the enclosing scope. It reads through buffer with no bytes_in_buffer
+ check and always peeks one byte past the byte it consumes. On a marker
+ it records the marker code in cinfo->unread_marker, rewinds buffer so
+ that it points at the FF again, and leaves eight zero bits in get_buffer
+ in place of the data byte. */
+
+#define GET_BYTE { \
+ register int c0, c1; \
+ c0 = *buffer++; \
+ c1 = *buffer; \
+ /* Pre-execute most common case */ \
+ get_buffer = (get_buffer << 8) | c0; \
+ bits_left += 8; \
+ if (c0 == 0xFF) { \
+ /* Pre-execute case of FF/00, which represents an FF data byte */ \
+ buffer++; \
+ if (c1 != 0) { \
+ /* Oops, it's actually a marker indicating end of compressed data. */ \
+ cinfo->unread_marker = c1; \
+ /* Back out pre-execution and fill the buffer with zero bits */ \
+ buffer -= 2; \
+ get_buffer &= ~0xFF; \
+ } \
+ } \
+}
+
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
+
+/* Pre-fetch 48 bits (6 bytes), because the holding register is 64-bit */
+#define FILL_BIT_BUFFER_FAST \
+ if (bits_left <= 16) { \
+ GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE \
+ }
+
+#else
+
+/* Pre-fetch 16 bits (2 bytes), because the holding register is 32-bit */
+#define FILL_BIT_BUFFER_FAST \
+ if (bits_left <= 16) { \
+ GET_BYTE GET_BYTE \
+ }
+
+#endif
+
+
+/*
+ * Out-of-line code for Huffman code decoding.
+ * See jdhuff.h for info about usage.
+ *
+ * state       bit-reading working state; get_buffer and bits_left are
+ *             stored back into it before a successful return
+ * get_buffer  caller's current bit buffer contents
+ * bits_left   number of valid bits in get_buffer
+ * htbl        derived table to decode with
+ * min_bits    number of bits the code is already known to need at least
+ *
+ * Returns the decoded symbol, or -1 if the data source suspended, or 0
+ * (after issuing JWRN_HUFF_BAD_CODE) if no code of 16 bits or fewer matched.
+ */
+
+GLOBAL(int)
+jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ d_derived_tbl *htbl, int min_bits)
+{
+ register int l = min_bits;
+ register JLONG code;
+
+ /* HUFF_DECODE has determined that the code is at least min_bits */
+ /* bits long, so fetch that many bits in one swoop. */
+
+ CHECK_BIT_BUFFER(*state, l, return -1);
+ code = GET_BITS(l);
+
+ /* Collect the rest of the Huffman code one bit at a time. */
+ /* This is per Figure F.16. */
+
+ /* maxcode[l] is -1 for unused lengths, so those are always stepped over;
+ * the maxcode[17] sentinel stops the loop at l = 17 at the latest.
+ */
+ while (code > htbl->maxcode[l]) {
+ code <<= 1;
+ CHECK_BIT_BUFFER(*state, 1, return -1);
+ code |= GET_BITS(1);
+ l++;
+ }
+
+ /* Unload the local registers */
+ state->get_buffer = get_buffer;
+ state->bits_left = bits_left;
+
+ /* With garbage input we may reach the sentinel value l = 17. */
+
+ if (l > 16) {
+ WARNMS(state->cinfo, JWRN_HUFF_BAD_CODE);
+ return 0; /* fake a zero as the safest result */
+ }
+
+ return htbl->pub->huffval[(int)(code + htbl->valoffset[l])];
+}
+
+
+/*
+ * Figure F.12: extend sign bit.
+ * On some machines, a shift and add will be faster than a table lookup.
+ *
+ * HUFF_EXTEND(x, s) maps an s-bit value x to its signed meaning: values
+ * below 2**(s-1) have (-1 << s) + 1 added, larger values are unchanged.
+ * The table variant below spells this out directly. The shift variant
+ * derives an all-ones or all-zeroes mask from the sign of x - 2**(s-1);
+ * NOTE(review): that relies on ">> 31" of a negative int being an
+ * arithmetic shift on a 32-bit int, which the C standard leaves
+ * implementation-defined.
+ * Both variants evaluate x and s more than once.
+ */
+
+#define AVOID_TABLES
+#ifdef AVOID_TABLES
+
+#define NEG_1 ((unsigned int)-1)
+#define HUFF_EXTEND(x, s) \
+ ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
+
+#else
+
+/* Not compiled, because AVOID_TABLES is defined unconditionally above. */
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
+
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
+
+#endif /* AVOID_TABLES */
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ * Returns FALSE if must suspend.
+ */
+
+LOCAL(boolean)
+process_restart(j_decompress_ptr cinfo)
+{
+  huff_entropy_ptr decoder = (huff_entropy_ptr)cinfo->entropy;
+  int comp;
+
+  /* Whatever is still in the bit buffer is dropped; its whole bytes are
+   * credited to next_marker's count of discarded bytes.
+   */
+  cinfo->marker->discarded_bytes += decoder->bitstate.bits_left / 8;
+  decoder->bitstate.bits_left = 0;
+
+  /* Consume the RSTn marker; a FALSE result means we must suspend */
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
+    return FALSE;
+
+  /* DC prediction starts over from 0 for every component in the scan */
+  for (comp = cinfo->comps_in_scan - 1; comp >= 0; comp--)
+    decoder->saved.last_dc_val[comp] = 0;
+
+  /* A fresh restart interval begins */
+  decoder->restarts_to_go = cinfo->restart_interval;
+
+  /* Clear the out-of-data flag, except when read_restart_marker left us
+   * sitting directly on another marker: the next data segment will then be
+   * treated as empty, and keeping the flag set avoids producing bogus
+   * output pixels for it.
+   */
+  if (!cinfo->unread_marker)
+    decoder->pub.insufficient_data = FALSE;
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU using the suspension-capable bit reader.
+ *
+ * Every bit fetch goes through HUFF_DECODE / CHECK_BIT_BUFFER with the
+ * failure action "return FALSE"; on that path neither entropy->bitstate nor
+ * entropy->saved has been written, so the caller can retry the whole MCU.
+ * MCU_data may be NULL, in which case the coefficients are decoded and
+ * discarded.  Returns TRUE once the MCU is complete and state is saved.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+LOCAL(boolean)
+decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+ BITREAD_STATE_VARS;
+ int blkn;
+ savable_state state;
+ /* Outer loop handles each block in the MCU */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved;
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
+ d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+ d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
+ register int s, k, r;
+
+ /* Decode a single block's worth of coefficients */
+
+ /* Section F.2.2.1: decode the DC coefficient difference */
+ HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+ if (s) {
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ }
+
+ if (entropy->dc_needed[blkn]) {
+ /* Convert DC difference to actual value, update last_dc_val */
+ int ci = cinfo->MCU_membership[blkn];
+ /* Certain malformed JPEG images produce repeated DC coefficient
+ * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+ * grow until it overflows or underflows a 32-bit signed integer. This
+ * behavior is, to the best of our understanding, innocuous, and it is
+ * unclear how to work around it without potentially affecting
+ * performance. Thus, we (hopefully temporarily) suppress UBSan integer
+ * overflow errors for this function and decode_mcu_fast().
+ */
+ s += state.last_dc_val[ci];
+ state.last_dc_val[ci] = s;
+ if (block) {
+ /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+ (*block)[0] = (JCOEF)s;
+ }
+ }
+
+ if (entropy->ac_needed[blkn] && block) {
+
+ /* Section F.2.2.2: decode the AC coefficients */
+ /* Since zeroes are skipped, output area must be cleared beforehand */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
+
+ /* symbol = (run of zeroes << 4) | size in bits of the coefficient */
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ /* Output coefficient in natural (dezigzagged) order.
+ * Note: the extra entries in jpeg_natural_order[] will save us
+ * if k >= DCTSIZE2, which could happen if the data is corrupted.
+ */
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
+ } else {
+ /* size 0: run 15 skips 16 zeroes, any other run ends the block */
+ if (r != 15)
+ break;
+ k += 15;
+ }
+ }
+
+ } else {
+
+ /* Section F.2.2.2: decode the AC coefficients */
+ /* In this path we just discard the values */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
+
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ DROP_BITS(s);
+ } else {
+ if (r != 15)
+ break;
+ k += 15;
+ }
+ }
+ }
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
+ return TRUE;
+}
+
+
+/*
+ * Decode one MCU reading straight from the source buffer through
+ * FILL_BIT_BUFFER_FAST / HUFF_DECODE_FAST, with no per-byte check of
+ * bytes_in_buffer.  decode_mcu() only calls this when at least
+ * BUFSIZE * blocks_in_MCU bytes are buffered and no marker is pending.
+ *
+ * Returns FALSE, after clearing cinfo->unread_marker and without saving any
+ * state, if GET_BYTE ran into a marker; decode_mcu() then redoes the MCU
+ * with decode_mcu_slow().  Returns TRUE once the MCU is complete and the
+ * source position, bit buffer and DC predictions have been saved.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+LOCAL(boolean)
+decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+ BITREAD_STATE_VARS;
+ JOCTET *buffer;
+ int blkn;
+ savable_state state;
+ /* Outer loop handles each block in the MCU */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ buffer = (JOCTET *)br_state.next_input_byte;
+ state = entropy->saved;
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
+ d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+ d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
+ register int s, k, r, l;
+
+ /* DC coefficient difference: symbol s is the number of extra bits */
+ HUFF_DECODE_FAST(s, l, dctbl);
+ if (s) {
+ FILL_BIT_BUFFER_FAST
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ }
+
+ if (entropy->dc_needed[blkn]) {
+ int ci = cinfo->MCU_membership[blkn];
+ /* Refer to the comment in decode_mcu_slow() regarding the suppression of
+ * a UBSan integer overflow error in this line of code.
+ */
+ s += state.last_dc_val[ci];
+ state.last_dc_val[ci] = s;
+ if (block)
+ (*block)[0] = (JCOEF)s;
+ }
+
+ if (entropy->ac_needed[blkn] && block) {
+
+ /* AC coefficients wanted: decode and store them */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE_FAST(s, l, actbl);
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ FILL_BIT_BUFFER_FAST
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
+ } else {
+ if (r != 15) break;
+ k += 15;
+ }
+ }
+
+ } else {
+
+ /* AC coefficients not wanted: decode them only to skip their bits */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE_FAST(s, l, actbl);
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ FILL_BIT_BUFFER_FAST
+ DROP_BITS(s);
+ } else {
+ if (r != 15) break;
+ k += 15;
+ }
+ }
+ }
+ }
+
+ /* A marker was seen by GET_BYTE: discard this attempt entirely */
+ if (cinfo->unread_marker != 0) {
+ cinfo->unread_marker = 0;
+ return FALSE;
+ }
+
+ /* Completed MCU: account for the bytes consumed, then save state */
+ br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
+ br_state.next_input_byte = buffer;
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
+ return TRUE;
+}
+
+
+/*
+ * Decode and return one MCU's worth of Huffman-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
+ * (Wholesale zeroing is usually a little faster than retail...)
+ *
+ * Returns FALSE if data source requested suspension. In that case no
+ * changes have been made to permanent state. (Exception: some output
+ * coefficients may already have been assigned. This is harmless for
+ * this module, since we'll just re-assign them on the next call.)
+ */
+
+#define BUFSIZE (DCTSIZE2 * 8)
+
+METHODDEF(boolean)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
+  int fast_path_ok;
+
+  /* With restart intervals in use, handle a due restart marker first (this
+   * may force suspension) and always stay on the slow decoder.
+   */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0 && !process_restart(cinfo))
+      return FALSE;
+    fast_path_ok = 0;
+  } else {
+    fast_path_ok = 1;
+  }
+
+  /* The fast decoder needs a pending-marker-free stream and at least
+   * BUFSIZE bytes of buffered input per block of the MCU.
+   */
+  if (cinfo->unread_marker != 0 ||
+      cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU)
+    fast_path_ok = 0;
+
+  /* Once we have run out of data the MCU is left as zeroes, which yields
+   * uniform gray for the remainder of the segment.
+   */
+  if (!entropy->pub.insufficient_data) {
+    /* Try the fast decoder when permitted; if it is not permitted, or it
+     * gives up, decode the MCU with the slow decoder instead.
+     */
+    if (!fast_path_ok || !decode_mcu_fast(cinfo, MCU_data)) {
+      if (!decode_mcu_slow(cinfo, MCU_data))
+        return FALSE;
+    }
+  }
+
+  /* One fewer MCU left in this restart interval (only when in use) */
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * Module initialization routine for Huffman entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_huff_decoder(j_decompress_ptr cinfo)
+{
+  huff_entropy_ptr decoder;
+  int tbl;
+
+  /* Motion JPEG frames typically omit the Huffman tables when they are the
+     default ones.  So if no tables have been set by the time this decoder
+     is initialized (usually inside jpeg_start_decompress()), the default
+     tables are installed. */
+  std_huff_tables((j_common_ptr)cinfo);
+
+  /* Create the decoder object and hook up its methods */
+  decoder = (huff_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(huff_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *)decoder;
+  decoder->pub.start_pass = start_pass_huff_decoder;
+  decoder->pub.decode_mcu = decode_mcu;
+
+  /* No derived table has been allocated yet */
+  for (tbl = 0; tbl < NUM_HUFF_TBLS; tbl++) {
+    decoder->dc_derived_tbls[tbl] = NULL;
+    decoder->ac_derived_tbls[tbl] = NULL;
+  }
+}
diff --git a/media/libjpeg/jdhuff.h b/media/libjpeg/jdhuff.h
new file mode 100644
index 0000000000..cfa0b7f558
--- /dev/null
+++ b/media/libjpeg/jdhuff.h
@@ -0,0 +1,247 @@
+/*
+ * jdhuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains declarations for Huffman entropy decoding routines
+ * that are shared between the sequential decoder (jdhuff.c) and the
+ * progressive decoder (jdphuff.c). No other modules need to see these.
+ */
+
+#include "jconfigint.h"
+
+
+/* Derived data constructed for each Huffman table */
+
+#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */
+
+typedef struct {
+ /* Basic tables: (element [0] of each array is unused) */
+ JLONG maxcode[18]; /* largest code of length k (-1 if none) */
+ /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
+ JLONG valoffset[18]; /* huffval[] offset for codes of length k */
+ /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
+ * the smallest code of length k; so given a code of length k, the
+ * corresponding symbol is huffval[code + valoffset[k]]
+ */
+
+ /* Link to public Huffman table (needed only in jpeg_huff_decode) */
+ JHUFF_TBL *pub;
+
+ /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
+ * the input data stream. If the next Huffman code is no more
+ * than HUFF_LOOKAHEAD bits long, we can obtain its length and
+ * the corresponding symbol directly from this table.
+ *
+ * The lower HUFF_LOOKAHEAD (8) bits of each table entry contain the
+ * symbol. The bits above them contain the number of bits in the
+ * corresponding Huffman code, or HUFF_LOOKAHEAD + 1 if too long.
+ * (HUFF_DECODE extracts them as entry >> HUFF_LOOKAHEAD and
+ * entry & ((1 << HUFF_LOOKAHEAD) - 1) respectively.)
+ */
+ int lookup[1 << HUFF_LOOKAHEAD];
+} d_derived_tbl;
+
+/* Expand a Huffman table definition into the derived format */
+EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
+ int tblno, d_derived_tbl **pdtbl);
+
+
+/*
+ * Fetching the next N bits from the input stream is a time-critical operation
+ * for the Huffman decoders. We implement it with a combination of inline
+ * macros and out-of-line subroutines. Note that N (the number of bits
+ * demanded at one time) never exceeds 15 for JPEG use.
+ *
+ * We read source bytes into get_buffer and dole out bits as needed.
+ * If get_buffer already contains enough bits, they are fetched in-line
+ * by the macros CHECK_BIT_BUFFER and GET_BITS. When there aren't enough
+ * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
+ * as full as possible (not just to the number of bits needed; this
+ * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
+ * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
+ * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
+ * at least the requested number of bits --- dummy zeroes are inserted if
+ * necessary.
+ */
+
+#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
+#error Cannot determine word size
+#endif
+
+/* Select the widest convenient unsigned type for the bit buffer. */
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
+
+/* size_t is taken to be 64 bits wide here */
+typedef size_t bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+/* x86-64 target whose size_t is not 8 bytes (presumably the x32 ABI --
+ * TODO confirm): use unsigned long long to get a 64-bit buffer anyway.
+ */
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+
+#else
+
+/* Everything else: fall back to a 32-bit buffer */
+typedef unsigned long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 32 /* size of buffer in bits */
+
+#endif
+
+/* If long is > 32 bits on your machine, and shifting/masking longs is
+ * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
+ * appropriately should be a win. Unfortunately we can't define the size
+ * with something like #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
+ * because not all machines measure sizeof in 8-bit bytes.
+ */
+
+/* The two halves of the bit reader's state: bitread_perm_state is what
+ * survives between MCUs; bitread_working_state is the scratch copy used while
+ * one MCU is being decoded (see BITREAD_LOAD_STATE / BITREAD_SAVE_STATE).
+ */
+typedef struct { /* Bitreading state saved across MCUs */
+ bit_buf_type get_buffer; /* current bit-extraction buffer */
+ int bits_left; /* # of unused bits in it */
+} bitread_perm_state;
+
+typedef struct { /* Bitreading working state within an MCU */
+ /* Current data source location */
+ /* We need a copy, rather than munging the original, in case of suspension */
+ const JOCTET *next_input_byte; /* => next byte to read from source */
+ size_t bytes_in_buffer; /* # of bytes remaining in source buffer */
+ /* Bit input buffer --- note these values are kept in register variables,
+ * not in this struct, inside the inner loops.
+ */
+ bit_buf_type get_buffer; /* current bit-extraction buffer */
+ int bits_left; /* # of unused bits in it */
+ /* Pointer needed by jpeg_fill_bit_buffer. */
+ j_decompress_ptr cinfo; /* back link to decompress master record */
+} bitread_working_state;
+
+/* Macros to declare and load/save bitread local variables.
+ *
+ * BITREAD_STATE_VARS declares the locals get_buffer, bits_left and br_state
+ * that the bit-fetching macros below refer to by name.
+ * BITREAD_LOAD_STATE copies the source position from cinfop->src into
+ * br_state and the bit buffer from permstate into the locals.
+ * BITREAD_SAVE_STATE copies them back in the opposite direction.
+ * Note that BITREAD_LOAD_STATE supplies its own trailing semicolon, while
+ * the other two rely on the one written at the point of use.
+ */
+#define BITREAD_STATE_VARS \
+ register bit_buf_type get_buffer; \
+ register int bits_left; \
+ bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop, permstate) \
+ br_state.cinfo = cinfop; \
+ br_state.next_input_byte = cinfop->src->next_input_byte; \
+ br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+ get_buffer = permstate.get_buffer; \
+ bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop, permstate) \
+ cinfop->src->next_input_byte = br_state.next_input_byte; \
+ cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+ permstate.get_buffer = get_buffer; \
+ permstate.bits_left = bits_left
+
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
+ * before using GET_BITS, PEEK_BITS, or DROP_BITS.
+ * The variables get_buffer and bits_left are assumed to be locals,
+ * but the state struct might not be (jpeg_huff_decode needs this).
+ * CHECK_BIT_BUFFER(state, n, action);
+ * Ensure there are N bits in get_buffer; if suspend, take action.
+ * val = GET_BITS(n);
+ * Fetch next N bits.
+ * val = PEEK_BITS(n);
+ * Fetch next N bits without removing them from the buffer.
+ * DROP_BITS(n);
+ * Discard next N bits.
+ * The value N should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ *
+ * "state" is a bitread_working_state lvalue, not a pointer; the macro takes
+ * its address itself. GET_BITS and DROP_BITS modify bits_left as a side
+ * effect; get_buffer itself is never modified by these three macros, the
+ * consumed bits simply fall below the bits_left mark.
+ */
+
+#define CHECK_BIT_BUFFER(state, nbits, action) { \
+ if (bits_left < (nbits)) { \
+ if (!jpeg_fill_bit_buffer(&(state), get_buffer, bits_left, nbits)) \
+ { action; } \
+ get_buffer = (state).get_buffer; bits_left = (state).bits_left; \
+ } \
+}
+
+#define GET_BITS(nbits) \
+ (((int)(get_buffer >> (bits_left -= (nbits)))) & ((1 << (nbits)) - 1))
+
+#define PEEK_BITS(nbits) \
+ (((int)(get_buffer >> (bits_left - (nbits)))) & ((1 << (nbits)) - 1))
+
+#define DROP_BITS(nbits) \
+ (bits_left -= (nbits))
+
+/* Load up the bit buffer to a depth of at least nbits */
+EXTERN(boolean) jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, int nbits);
+
+
+/*
+ * Code for extracting next Huffman-coded symbol from input bit stream.
+ * Again, this is time-critical and we make the main paths be macros.
+ *
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping. Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long. The few overlength codes are handled with a loop,
+ * which need not be inline code.
+ *
+ * Notes about the HUFF_DECODE macro:
+ * 1. Near the end of the data segment, we may fail to get enough bits
+ * for a lookahead. In that case, we do it the hard way.
+ * 2. If the lookahead table contains no entry, the next code must be
+ * more than HUFF_LOOKAHEAD bits long.
+ * 3. jpeg_huff_decode returns -1 if forced to suspend.
+ *
+ * Arguments of the HUFF_DECODE macro:
+ * result      int lvalue that receives the decoded symbol
+ * state       bitread_working_state lvalue (the macro takes its address)
+ * htbl        d_derived_tbl * to decode with
+ * failaction  statement executed if the data source suspends
+ * slowlabel   label name defined inside the expansion; each use of the
+ *             macro within one function therefore needs its own name
+ */
+
+#define HUFF_DECODE(result, state, htbl, failaction, slowlabel) { \
+ register int nb, look; \
+ if (bits_left < HUFF_LOOKAHEAD) { \
+ if (!jpeg_fill_bit_buffer(&state, get_buffer, bits_left, 0)) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if (bits_left < HUFF_LOOKAHEAD) { \
+ nb = 1; goto slowlabel; \
+ } \
+ } \
+ look = PEEK_BITS(HUFF_LOOKAHEAD); \
+ if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
+ DROP_BITS(nb); \
+ result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
+ } else { \
+slowlabel: \
+ if ((result = \
+ jpeg_huff_decode(&state, get_buffer, bits_left, htbl, nb)) < 0) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ } \
+}
+
+/*
+ * Variant of HUFF_DECODE with no failure/suspension path. It refills the
+ * bit buffer with FILL_BIT_BUFFER_FAST, which this header does not define;
+ * the including source file must provide it.
+ * s     int lvalue; receives the decoded symbol, or 0 if no code of
+ *       16 bits or fewer matched
+ * nb    int lvalue used as scratch; receives the code length
+ * htbl  d_derived_tbl * to decode with
+ * The expansion is several statements with no enclosing braces, so it must
+ * not be used as the unbraced body of an if, else or loop.
+ */
+#define HUFF_DECODE_FAST(s, nb, htbl) \
+ FILL_BIT_BUFFER_FAST; \
+ s = PEEK_BITS(HUFF_LOOKAHEAD); \
+ s = htbl->lookup[s]; \
+ nb = s >> HUFF_LOOKAHEAD; \
+ /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \
+ DROP_BITS(nb); \
+ s = s & ((1 << HUFF_LOOKAHEAD) - 1); \
+ if (nb > HUFF_LOOKAHEAD) { \
+ /* Equivalent of jpeg_huff_decode() */ \
+ /* Don't use GET_BITS() here because we don't want to modify bits_left */ \
+ s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \
+ while (s > htbl->maxcode[nb]) { \
+ s <<= 1; \
+ s |= GET_BITS(1); \
+ nb++; \
+ } \
+ if (nb > 16) \
+ s = 0; \
+ else \
+ s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
+ }
+
+/* Out-of-line case for Huffman code fetching */
+EXTERN(int) jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, d_derived_tbl *htbl,
+ int min_bits);
diff --git a/media/libjpeg/jdicc.c b/media/libjpeg/jdicc.c
new file mode 100644
index 0000000000..50aa9a9676
--- /dev/null
+++ b/media/libjpeg/jdicc.c
@@ -0,0 +1,167 @@
+/*
+ * jdicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to read International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to get the profile data from a JPEG file while reading it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC (APP2) */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2: 12 identifier bytes, then the sequence-number byte and the marker-count byte */
+
+
+/*
+ * Handy subroutine to test whether a saved marker is an ICC profile marker.
+ */
+
+LOCAL(boolean)
+marker_is_icc(jpeg_saved_marker_ptr marker)
+{
+  /* The 12 identifier bytes that open every ICC APP2 payload:
+   * the ASCII text "ICC_PROFILE" followed by a terminating zero byte.
+   */
+  static const JOCTET icc_signature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00
+  };
+  int pos;
+
+  /* Must be an APP2 marker long enough to hold the fixed overhead */
+  if (marker->marker != ICC_MARKER || marker->data_length < ICC_OVERHEAD_LEN)
+    return FALSE;
+  /* Every identifier byte has to match */
+  for (pos = 0; pos < 12; pos++)
+    if (marker->data[pos] != icc_signature[pos])
+      return FALSE;
+  return TRUE;
+}
+
+
+/*
+ * See if there was an ICC profile in the JPEG file being read; if so,
+ * reassemble and return the profile data.
+ *
+ * TRUE is returned if an ICC profile was found, FALSE if not. If TRUE is
+ * returned, *icc_data_ptr is set to point to the returned data, and
+ * *icc_data_len is set to its length.
+ *
+ * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be
+ * freed by the caller with free() when the caller no longer needs it.
+ * (Alternatively, we could write this routine to use the IJG library's memory
+ * allocator, so that the data would be freed implicitly when
+ * jpeg_finish_decompress() is called. But it seems likely that many
+ * applications will prefer to have the data stick around after decompression
+ * finishes.)
+ */
+
+GLOBAL(boolean)
+jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                      unsigned int *icc_data_len)
+{
+#define MAX_SEQ_NO  255        /* sufficient since marker numbers are bytes */
+  /* Per-chunk bookkeeping, indexed by sequence number (slot 0 is unused) */
+  char seen[MAX_SEQ_NO + 1];                /* nonzero once the chunk is found */
+  unsigned int chunk_len[MAX_SEQ_NO + 1];   /* profile bytes carried by chunk */
+  unsigned int chunk_off[MAX_SEQ_NO + 1];   /* chunk's offset in the output */
+  jpeg_saved_marker_ptr cur;
+  JOCTET *profile;
+  unsigned int profile_len = 0;
+  int expected = 0;             /* chunk count announced by the markers */
+  int idx;
+
+  /* Reject unusable arguments and calls made in too early a state */
+  if (icc_data_ptr == NULL || icc_data_len == NULL)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  if (cinfo->global_state < DSTATE_READY)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Give the caller well-defined outputs on every FALSE return */
+  *icc_data_len = 0;
+  *icc_data_ptr = NULL;
+
+  /* Pass 1: walk the saved markers, note which ICC chunks exist, and verify
+   * that their numbering is consistent.
+   */
+  for (idx = 1; idx <= MAX_SEQ_NO; idx++)
+    seen[idx] = 0;
+
+  for (cur = cinfo->marker_list; cur != NULL; cur = cur->next) {
+    if (!marker_is_icc(cur))
+      continue;                 /* some other saved marker */
+    /* Byte 13 is the total chunk count; all ICC markers must agree on it */
+    if (expected == 0) {
+      expected = cur->data[13];
+    } else if (expected != cur->data[13]) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
+      return FALSE;
+    }
+    /* Byte 12 is this chunk's 1-based sequence number */
+    idx = cur->data[12];
+    if (idx <= 0 || idx > expected) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
+      return FALSE;
+    }
+    if (seen[idx]) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* duplicate sequence numbers */
+      return FALSE;
+    }
+    seen[idx] = 1;
+    chunk_len[idx] = cur->data_length - ICC_OVERHEAD_LEN;
+  }
+
+  if (expected == 0)
+    return FALSE;               /* no ICC markers among the saved markers */
+
+  /* Pass 2: make sure no chunk is missing, and lay the chunks out back to
+   * back in sequence order, accumulating the total profile size.
+   */
+  for (idx = 1; idx <= expected; idx++) {
+    if (!seen[idx]) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* missing sequence number */
+      return FALSE;
+    }
+    chunk_off[idx] = profile_len;
+    profile_len += chunk_len[idx];
+  }
+
+  if (profile_len == 0) {
+    WARNMS(cinfo, JWRN_BOGUS_ICC);    /* found only empty markers? */
+    return FALSE;
+  }
+
+  /* Allocate the output buffer; the caller releases it with free() */
+  profile = (JOCTET *)malloc(profile_len * sizeof(JOCTET));
+  if (profile == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11);  /* oops, out of memory */
+
+  /* Pass 3: copy each chunk's payload to its slot in the output buffer */
+  for (cur = cinfo->marker_list; cur != NULL; cur = cur->next) {
+    JOCTET FAR *from;
+    JOCTET *to;
+    unsigned int k;
+
+    if (!marker_is_icc(cur))
+      continue;
+    idx = cur->data[12];
+    from = cur->data + ICC_OVERHEAD_LEN;
+    to = profile + chunk_off[idx];
+    for (k = 0; k < chunk_len[idx]; k++)
+      to[k] = from[k];
+  }
+
+  *icc_data_ptr = profile;
+  *icc_data_len = profile_len;
+  return TRUE;
+}
diff --git a/media/libjpeg/jdinput.c b/media/libjpeg/jdinput.c
new file mode 100644
index 0000000000..1bc5aff1a7
--- /dev/null
+++ b/media/libjpeg/jdinput.c
@@ -0,0 +1,408 @@
+/*
+ * jdinput.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains input control logic for the JPEG decompressor.
+ * These routines are concerned with controlling the decompressor's input
+ * processing (marker reading and coefficient decoding). The actual input
+ * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Private state */
+
+typedef struct {
+ struct jpeg_input_controller pub; /* public fields; first member, so cinfo->inputctl can be cast to my_inputctl_ptr */
+
+ boolean inheaders; /* TRUE until first SOS is reached; cleared by consume_markers */
+} my_input_controller;
+
+typedef my_input_controller *my_inputctl_ptr; /* pointer form used when casting cinfo->inputctl */
+
+
+/* Forward declarations */
+METHODDEF(int) consume_markers(j_decompress_ptr cinfo);
+
+
+/*
+ * Routines to calculate various quantities related to the size of the image.
+ */
+
+LOCAL(void)
+initial_setup(j_decompress_ptr cinfo)
+/* Called once, when first SOS marker is reached */
+{
+  jpeg_component_info *comp;
+  int n;
+
+  /* Refuse images whose dimensions exceed the compiled-in limit */
+  if ((long)cinfo->image_width > (long)JPEG_MAX_DIMENSION ||
+      (long)cinfo->image_height > (long)JPEG_MAX_DIMENSION)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
+
+  /* The sample precision has to equal the compiled-in value */
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* The component count must fit the fixed-size internal arrays */
+  if (cinfo->num_components > MAX_COMPONENTS)
+    ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
+             MAX_COMPONENTS);
+
+  /* Validate every sampling factor and track the largest in each direction */
+  cinfo->max_h_samp_factor = 1;
+  cinfo->max_v_samp_factor = 1;
+  for (n = 0; n < cinfo->num_components; n++) {
+    comp = &cinfo->comp_info[n];
+    if (comp->h_samp_factor <= 0 || comp->h_samp_factor > MAX_SAMP_FACTOR ||
+        comp->v_samp_factor <= 0 || comp->v_samp_factor > MAX_SAMP_FACTOR)
+      ERREXIT(cinfo, JERR_BAD_SAMPLING);
+    if (comp->h_samp_factor > cinfo->max_h_samp_factor)
+      cinfo->max_h_samp_factor = comp->h_samp_factor;
+    if (comp->v_samp_factor > cinfo->max_v_samp_factor)
+      cinfo->max_v_samp_factor = comp->v_samp_factor;
+  }
+
+#if JPEG_LIB_VERSION >= 80
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2 - 1;
+#endif
+
+  /* Default the minimum scaled DCT size to DCTSIZE.  jdmaster.c overrides
+   * this in the full decompressor; the transcoder never uses jdmaster.c, so
+   * the value has to be established here.
+   */
+#if JPEG_LIB_VERSION >= 70
+  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#else
+  cinfo->min_DCT_scaled_size = DCTSIZE;
+#endif
+
+  /* Derive the per-component geometry */
+  for (n = 0; n < cinfo->num_components; n++) {
+    long scaled_w, scaled_h;    /* image size times the sampling factor */
+
+    comp = &cinfo->comp_info[n];
+    scaled_w = (long)cinfo->image_width * (long)comp->h_samp_factor;
+    scaled_h = (long)cinfo->image_height * (long)comp->v_samp_factor;
+#if JPEG_LIB_VERSION >= 70
+    comp->DCT_h_scaled_size = comp->DCT_v_scaled_size = DCTSIZE;
+#else
+    comp->DCT_scaled_size = DCTSIZE;
+#endif
+
+    /* Size in DCT blocks */
+    comp->width_in_blocks = (JDIMENSION)
+      jdiv_round_up(scaled_w, (long)(cinfo->max_h_samp_factor * DCTSIZE));
+    comp->height_in_blocks = (JDIMENSION)
+      jdiv_round_up(scaled_h, (long)(cinfo->max_v_samp_factor * DCTSIZE));
+    /* By default every MCU column of a multi-scan image is decompressed,
+     * so the column window spans the component's full width.
+     */
+    cinfo->master->first_MCU_col[n] = 0;
+    cinfo->master->last_MCU_col[n] = comp->width_in_blocks - 1;
+    /* Size in samples.  jdmaster.c also overrides these when doing full
+     * decompression; the transcoder library doesn't use them, but the
+     * calling application might.
+     */
+    comp->downsampled_width = (JDIMENSION)
+      jdiv_round_up(scaled_w, (long)cinfo->max_h_samp_factor);
+    comp->downsampled_height = (JDIMENSION)
+      jdiv_round_up(scaled_h, (long)cinfo->max_v_samp_factor);
+    /* Mark component needed, until color conversion says otherwise */
+    comp->component_needed = TRUE;
+    /* No quantization table has been saved for this component yet */
+    comp->quant_table = NULL;
+  }
+
+  /* Number of fully interleaved MCU rows in the image */
+  cinfo->total_iMCU_rows = (JDIMENSION)
+    jdiv_round_up((long)cinfo->image_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
+
+  /* The file is multi-scan when it is progressive or when this first scan
+   * does not carry every component.
+   */
+  if (cinfo->progressive_mode || cinfo->comps_in_scan < cinfo->num_components)
+    cinfo->inputctl->has_multiple_scans = TRUE;
+  else
+    cinfo->inputctl->has_multiple_scans = FALSE;
+}
+
+
+LOCAL(void)
+per_scan_setup(j_decompress_ptr cinfo)
+/* Do computations that are needed before processing a JPEG scan */
+/* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */
+{
+  jpeg_component_info *comp;
+  int idx, nblocks, partial;
+
+  /* A one-component scan is noninterleaved; deal with it and return early */
+  if (cinfo->comps_in_scan == 1) {
+    comp = cinfo->cur_comp_info[0];
+
+    /* The image measured in MCUs equals the component measured in blocks */
+    cinfo->MCUs_per_row = comp->width_in_blocks;
+    cinfo->MCU_rows_in_scan = comp->height_in_blocks;
+
+    /* For noninterleaved scan, always one block per MCU */
+    comp->MCU_width = 1;
+    comp->MCU_height = 1;
+    comp->MCU_blocks = 1;
+    comp->MCU_sample_width = comp->_DCT_scaled_size;
+    comp->last_col_width = 1;
+    /* For noninterleaved scans, last_row_height is defined as the number of
+     * block rows present in the last iMCU row.
+     */
+    partial = (int)(comp->height_in_blocks % comp->v_samp_factor);
+    comp->last_row_height = (partial != 0) ? partial : comp->v_samp_factor;
+
+    /* The MCU consists of one block, owned by scan component 0 */
+    cinfo->blocks_in_MCU = 1;
+    cinfo->MCU_membership[0] = 0;
+    return;
+  }
+
+  /* Interleaved (multi-component) scan.
+   * comps_in_scan cannot be 1 here, so only zero, negative, or oversized
+   * component counts remain to be rejected.
+   */
+  if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
+    ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
+             MAX_COMPS_IN_SCAN);
+
+  /* Overall image size in MCUs */
+  cinfo->MCUs_per_row = (JDIMENSION)
+    jdiv_round_up((long)cinfo->image_width,
+                  (long)(cinfo->max_h_samp_factor * DCTSIZE));
+  cinfo->MCU_rows_in_scan = (JDIMENSION)
+    jdiv_round_up((long)cinfo->image_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
+
+  /* blocks_in_MCU grows as each component's blocks are appended below */
+  cinfo->blocks_in_MCU = 0;
+
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    comp = cinfo->cur_comp_info[idx];
+
+    /* Sampling factors give # of blocks of component in each MCU */
+    comp->MCU_width = comp->h_samp_factor;
+    comp->MCU_height = comp->v_samp_factor;
+    comp->MCU_blocks = comp->MCU_width * comp->MCU_height;
+    comp->MCU_sample_width = comp->MCU_width * comp->_DCT_scaled_size;
+
+    /* Figure number of non-dummy blocks in last MCU column & row */
+    partial = (int)(comp->width_in_blocks % comp->MCU_width);
+    comp->last_col_width = (partial != 0) ? partial : comp->MCU_width;
+    partial = (int)(comp->height_in_blocks % comp->MCU_height);
+    comp->last_row_height = (partial != 0) ? partial : comp->MCU_height;
+
+    /* Append this component's blocks to the MCU composition array, after
+     * checking that they still fit.
+     */
+    if (cinfo->blocks_in_MCU + comp->MCU_blocks > D_MAX_BLOCKS_IN_MCU)
+      ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
+    for (nblocks = comp->MCU_blocks; nblocks > 0; nblocks--)
+      cinfo->MCU_membership[cinfo->blocks_in_MCU++] = idx;
+  }
+}
+
+
+/*
+ * Save away a copy of the Q-table referenced by each component present
+ * in the current scan, unless already saved during a prior scan.
+ *
+ * In a multiple-scan JPEG file, the encoder could assign different components
+ * the same Q-table slot number, but change table definitions between scans
+ * so that each component uses a different Q-table. (The IJG encoder is not
+ * currently capable of doing this, but other encoders might.) Since we want
+ * to be able to dequantize all the components at the end of the file, this
+ * means that we have to save away the table actually used for each component.
+ * We do this by copying the table at the start of the first scan containing
+ * the component.
+ * Rec. ITU-T T.81 | ISO/IEC 10918-1 prohibits the encoder from changing the
+ * contents of a Q-table slot between scans of a component using that slot. If
+ * the encoder does so anyway, this decoder will simply use the Q-table values
+ * that were current at the start of the first scan for the component.
+ *
+ * The decompressor output side looks only at the saved quant tables,
+ * not at the current Q-table slots.
+ */
+
+LOCAL(void)
+latch_quant_tables(j_decompress_ptr cinfo)
+{
+  int idx;
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    jpeg_component_info *comp = cinfo->cur_comp_info[idx];
+    JQUANT_TBL *saved;
+    int slot;
+
+    /* Only components without a previously saved table need any work */
+    if (comp->quant_table == NULL) {
+      /* The referenced table slot must be valid and must hold a table */
+      slot = comp->quant_tbl_no;
+      if (slot < 0 || slot >= NUM_QUANT_TBLS ||
+          cinfo->quant_tbl_ptrs[slot] == NULL)
+        ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, slot);
+      /* Copy the current table contents into JPOOL_IMAGE storage */
+      saved = (JQUANT_TBL *)
+        (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                    sizeof(JQUANT_TBL));
+      memcpy(saved, cinfo->quant_tbl_ptrs[slot], sizeof(JQUANT_TBL));
+      comp->quant_table = saved;
+    }
+  }
+}
+
+
+/*
+ * Initialize the input modules to read a scan of compressed data.
+ * The first call to this is done by jdmaster.c after initializing
+ * the entire decompressor (during jpeg_start_decompress).
+ * Subsequent calls come from consume_markers, below.
+ */
+
+METHODDEF(void)
+start_input_pass(j_decompress_ptr cinfo)
+{
+  per_scan_setup(cinfo);                /* geometry of the new scan */
+  latch_quant_tables(cinfo);            /* save the Q-tables it references */
+  cinfo->entropy->start_pass(cinfo);
+  cinfo->coef->start_input_pass(cinfo);
+  cinfo->inputctl->consume_input = cinfo->coef->consume_data;
+}
+
+
+/*
+ * Finish up after inputting a compressed-data scan.
+ * This is called by the coefficient controller after it's read all
+ * the expected data of the scan.
+ */
+
+METHODDEF(void)
+finish_input_pass(j_decompress_ptr cinfo)
+{
+  ((my_inputctl_ptr)cinfo->inputctl)->pub.consume_input = consume_markers;
+}
+
+
+/*
+ * Read JPEG markers before, between, or after compressed-data scans.
+ * Change state as necessary when a new scan is reached.
+ * Return value is JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ *
+ * The consume_input method pointer points either here or to the
+ * coefficient controller's consume_data routine, depending on whether
+ * we are reading a compressed data segment or inter-segment markers.
+ */
+
+METHODDEF(int)
+consume_markers(j_decompress_ptr cinfo)
+{
+  my_inputctl_ptr ctl = (my_inputctl_ptr)cinfo->inputctl;
+  int status;
+
+  /* After hitting EOI, read no further */
+  if (ctl->pub.eoi_reached)
+    return JPEG_REACHED_EOI;
+
+  status = (*cinfo->marker->read_markers) (cinfo);
+
+  if (status == JPEG_REACHED_SOS) {
+    if (ctl->inheaders) {
+      /* First SOS: perform the one-time image setup.
+       * Note: start_input_pass must be called by jdmaster.c before any more
+       * input can be consumed.  jdapimin.c is responsible for enforcing
+       * this sequencing.
+       */
+      initial_setup(cinfo);
+      ctl->inheaders = FALSE;
+    } else {
+      /* Second or later SOS: only legal when more scans are expected */
+      if (!ctl->pub.has_multiple_scans)
+        ERREXIT(cinfo, JERR_EOI_EXPECTED);  /* Oops, I wasn't expecting this! */
+      start_input_pass(cinfo);
+    }
+  } else if (status == JPEG_REACHED_EOI) {
+    ctl->pub.eoi_reached = TRUE;
+    if (ctl->inheaders) {
+      /* Tables-only datastream, apparently -- unless a SOF was seen */
+      if (cinfo->marker->saw_SOF)
+        ERREXIT(cinfo, JERR_SOF_NO_SOS);
+    } else if (cinfo->output_scan_number > cinfo->input_scan_number) {
+      /* Prevent infinite loop in coef ctlr's decompress_data routine
+       * if user set output_scan_number larger than number of scans.
+       */
+      cinfo->output_scan_number = cinfo->input_scan_number;
+    }
+  }
+  /* Any other status, JPEG_SUSPENDED included, needs no bookkeeping here
+   * and is simply handed back to the caller.
+   */
+  return status;
+}
+
+
+/*
+ * Reset state to begin a fresh datastream.
+ */
+
+METHODDEF(void)
+reset_input_controller(j_decompress_ptr cinfo)
+{
+  my_inputctl_ptr ctl = (my_inputctl_ptr)cinfo->inputctl;
+
+  ctl->inheaders = TRUE;                /* back to reading headers */
+  ctl->pub.eoi_reached = FALSE;
+  ctl->pub.has_multiple_scans = FALSE;  /* "unknown" would be better */
+  ctl->pub.consume_input = consume_markers;
+  /* Reset other modules */
+  cinfo->err->reset_error_mgr((j_common_ptr)cinfo);
+  cinfo->marker->reset_marker_reader(cinfo);
+  /* Reset progression state -- would be cleaner if entropy decoder did this */
+  cinfo->coef_bits = NULL;
+}
+
+
+/*
+ * Initialize the input controller module.
+ * This is called only once, when the decompression object is created.
+ */
+
+GLOBAL(void)
+jinit_input_controller(j_decompress_ptr cinfo)
+{
+  my_inputctl_ptr ctl;
+
+  /* The controller object is carved out of the permanent pool */
+  ctl = (my_inputctl_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(my_input_controller));
+  /* Initial state.  reset_input_controller is not usable for this, because
+   * it would also try to reset other modules, which is not wanted yet.
+   */
+  ctl->inheaders = TRUE;
+  ctl->pub.eoi_reached = FALSE;
+  ctl->pub.has_multiple_scans = FALSE;  /* "unknown" would be better */
+  /* Method pointers */
+  ctl->pub.start_input_pass = start_input_pass;
+  ctl->pub.finish_input_pass = finish_input_pass;
+  ctl->pub.reset_input_controller = reset_input_controller;
+  ctl->pub.consume_input = consume_markers;
+  cinfo->inputctl = (struct jpeg_input_controller *)ctl;
+}
diff --git a/media/libjpeg/jdmainct.c b/media/libjpeg/jdmainct.c
new file mode 100644
index 0000000000..d332e6b2fa
--- /dev/null
+++ b/media/libjpeg/jdmainct.c
@@ -0,0 +1,460 @@
+/*
+ * jdmainct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the main buffer controller for decompression.
+ * The main buffer lies between the JPEG decompressor proper and the
+ * post-processor; it holds downsampled data in the JPEG colorspace.
+ *
+ * Note that this code is bypassed in raw-data mode, since the application
+ * supplies the equivalent of the main buffer in that case.
+ */
+
+#include "jinclude.h"
+#include "jdmainct.h"
+
+
+/*
+ * In the current system design, the main buffer need never be a full-image
+ * buffer; any full-height buffers will be found inside the coefficient or
+ * postprocessing controllers. Nonetheless, the main controller is not
+ * trivial. Its responsibility is to provide context rows for upsampling/
+ * rescaling, and doing this in an efficient fashion is a bit tricky.
+ *
+ * Postprocessor input data is counted in "row groups". A row group
+ * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * sample rows of each component. (We require DCT_scaled_size values to be
+ * chosen such that these numbers are integers. In practice DCT_scaled_size
+ * values will likely be powers of two, so we actually have the stronger
+ * condition that DCT_scaled_size / min_DCT_scaled_size is an integer.)
+ * Upsampling will typically produce max_v_samp_factor pixel rows from each
+ * row group (times any additional scale factor that the upsampler is
+ * applying).
+ *
+ * The coefficient controller will deliver data to us one iMCU row at a time;
+ * each iMCU row contains v_samp_factor * DCT_scaled_size sample rows, or
+ * exactly min_DCT_scaled_size row groups. (This amount of data corresponds
+ * to one row of MCUs when the image is fully interleaved.) Note that the
+ * number of sample rows varies across components, but the number of row
+ * groups does not. Some garbage sample rows may be included in the last iMCU
+ * row at the bottom of the image.
+ *
+ * Depending on the vertical scaling algorithm used, the upsampler may need
+ * access to the sample row(s) above and below its current input row group.
+ * The upsampler is required to set need_context_rows TRUE at global selection
+ * time if so. When need_context_rows is FALSE, this controller can simply
+ * obtain one iMCU row at a time from the coefficient controller and dole it
+ * out as row groups to the postprocessor.
+ *
+ * When need_context_rows is TRUE, this controller guarantees that the buffer
+ * passed to postprocessing contains at least one row group's worth of samples
+ * above and below the row group(s) being processed. Note that the context
+ * rows "above" the first passed row group appear at negative row offsets in
+ * the passed buffer. At the top and bottom of the image, the required
+ * context rows are manufactured by duplicating the first or last real sample
+ * row; this avoids having special cases in the upsampling inner loops.
+ *
+ * The amount of context is fixed at one row group just because that's a
+ * convenient number for this controller to work with. The existing
+ * upsamplers really only need one sample row of context. An upsampler
+ * supporting arbitrary output rescaling might wish for more than one row
+ * group of context when shrinking the image; tough, we don't handle that.
+ * (This is justified by the assumption that downsizing will be handled mostly
+ * by adjusting the DCT_scaled_size values, so that the actual scale factor at
+ * the upsample step needn't be much less than one.)
+ *
+ * To provide the desired context, we have to retain the last two row groups
+ * of one iMCU row while reading in the next iMCU row. (The last row group
+ * can't be processed until we have another row group for its below-context,
+ * and so we have to save the next-to-last group too for its above-context.)
+ * We could do this most simply by copying data around in our buffer, but
+ * that'd be very slow. We can avoid copying any data by creating a rather
+ * strange pointer structure. Here's how it works. We allocate a workspace
+ * consisting of M+2 row groups (where M = min_DCT_scaled_size is the number
+ * of row groups per iMCU row). We create two sets of redundant pointers to
+ * the workspace. Labeling the physical row groups 0 to M+1, the synthesized
+ * pointer lists look like this:
+ *                    M+1                            M-1
+ * master pointer -->  0         master pointer -->  0
+ *                     1                             1
+ *                    ...                           ...
+ *                    M-3                           M-3
+ *                    M-2                            M
+ *                    M-1                           M+1
+ *                     M                            M-2
+ *                    M+1                           M-1
+ *                     0                             0
+ * We read alternate iMCU rows using each master pointer; thus the last two
+ * row groups of the previous iMCU row remain un-overwritten in the workspace.
+ * The pointer lists are set up so that the required context rows appear to
+ * be adjacent to the proper places when we pass the pointer lists to the
+ * upsampler.
+ *
+ * The above pictures describe the normal state of the pointer lists.
+ * At top and bottom of the image, we diddle the pointer lists to duplicate
+ * the first or last sample row as necessary (this is cheaper than copying
+ * sample rows around).
+ *
+ * This scheme breaks down if M < 2, i.e., min_DCT_scaled_size is 1. In that
+ * situation each iMCU row provides only one row group so the buffering logic
+ * must be different (e.g., we must read two iMCU rows before we can emit the
+ * first row group). For now, we simply do not support providing context
+ * rows when min_DCT_scaled_size is 1. That combination seems unlikely to
+ * be worth providing --- if someone wants a 1/8th-size preview, they probably
+ * want it quick and dirty, so a context-free upsampler is sufficient.
+ */
+
+
+/* Forward declarations */
+METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+#ifdef QUANT_2PASS_SUPPORTED
+METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+#endif
+
+
+LOCAL(void)
+alloc_funny_pointers(j_decompress_ptr cinfo)
+/* Allocate space for the funny pointer lists.
+ * This is done only once, not once per pass.
+ */
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  int M = cinfo->_min_DCT_scaled_size;
+  int comp_idx;
+
+  /* One allocation serves both top-level arrays of per-component list
+   * pointers: xbuffer[1] is the second half of the block at xbuffer[0].
+   */
+  main_ptr->xbuffer[0] = (JSAMPIMAGE)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                cinfo->num_components * 2 * sizeof(JSAMPARRAY));
+  main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
+
+  for (comp_idx = 0; comp_idx < cinfo->num_components; comp_idx++) {
+    jpeg_component_info *comp = &cinfo->comp_info[comp_idx];
+    /* height of a row group of component */
+    int rgroup = (comp->v_samp_factor * comp->_DCT_scaled_size) / M;
+    /* Each list holds M+4 row groups' worth of row pointers */
+    int list_len = rgroup * (M + 4);
+    JSAMPARRAY lists;
+
+    /* Again, a single allocation provides both pointer lists */
+    lists = (JSAMPARRAY)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  2 * list_len * sizeof(JSAMPROW));
+    /* Entry 0 of each list sits rgroup entries into its storage, because
+     * we want one row group at negative offsets.
+     */
+    main_ptr->xbuffer[0][comp_idx] = lists + rgroup;
+    main_ptr->xbuffer[1][comp_idx] = lists + rgroup + list_len;
+  }
+}
+
+
+LOCAL(void)
+make_funny_pointers(j_decompress_ptr cinfo)
+/* Create the funny pointer lists discussed in the comments above.
+ * The actual workspace is already allocated (in main_ptr->buffer),
+ * and the space for the pointer lists is allocated too.
+ * This routine just fills in the curiously ordered lists.
+ * This will be repeated at the beginning of each pass.
+ */
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  int M = cinfo->_min_DCT_scaled_size;
+  int comp_idx, row;
+
+  for (comp_idx = 0; comp_idx < cinfo->num_components; comp_idx++) {
+    jpeg_component_info *comp = &cinfo->comp_info[comp_idx];
+    JSAMPARRAY work = main_ptr->buffer[comp_idx];
+    JSAMPARRAY list0 = main_ptr->xbuffer[0][comp_idx];
+    JSAMPARRAY list1 = main_ptr->xbuffer[1][comp_idx];
+    /* height of a row group of component */
+    int rgroup = (comp->v_samp_factor * comp->_DCT_scaled_size) / M;
+    int near_end = rgroup * (M - 2);    /* first row of groups M-2 and M-1 */
+    int at_end = rgroup * M;            /* first row of groups M and M+1 */
+
+    /* Both lists start out as plain copies of the workspace pointers */
+    for (row = 0; row < rgroup * (M + 2); row++) {
+      list0[row] = work[row];
+      list1[row] = work[row];
+    }
+    /* In the second list, exchange the last two pairs of row groups */
+    for (row = 0; row < rgroup * 2; row++) {
+      list1[near_end + row] = work[at_end + row];
+      list1[at_end + row] = work[near_end + row];
+    }
+    /* The wraparound pointers at top and bottom will be filled later
+     * (see set_wraparound_pointers, below).  Initially the "above" pointers
+     * must duplicate the first actual data line; only xbuffer[0] needs it.
+     */
+    for (row = 1; row <= rgroup; row++)
+      list0[-row] = list0[0];
+  }
+}
+
+
+LOCAL(void)
+set_bottom_pointers(j_decompress_ptr cinfo)
+/* Change the pointer lists to duplicate the last sample row at the bottom
+ * of the image.  whichptr indicates which xbuffer holds the final iMCU row.
+ * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
+ */
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  int comp_idx, extra;
+
+  for (comp_idx = 0; comp_idx < cinfo->num_components; comp_idx++) {
+    jpeg_component_info *comp = &cinfo->comp_info[comp_idx];
+    JSAMPARRAY list = main_ptr->xbuffer[main_ptr->whichptr][comp_idx];
+    /* Sample rows in one iMCU row and in one row group */
+    int imcu_rows = comp->v_samp_factor * comp->_DCT_scaled_size;
+    int rgroup = imcu_rows / cinfo->_min_DCT_scaled_size;
+    /* Nondummy sample rows remaining for this component */
+    int real_rows = (int)(comp->downsampled_height % (JDIMENSION)imcu_rows);
+
+    if (real_rows == 0)
+      real_rows = imcu_rows;
+
+    /* Count nondummy row groups.  Every component should give the same
+     * answer, so only the first one is consulted.
+     */
+    if (comp_idx == 0)
+      main_ptr->rowgroups_avail = (JDIMENSION)((real_rows - 1) / rgroup + 1);
+
+    /* Duplicate the last real sample row rgroup*2 times; this pads out the
+     * last partial rowgroup and ensures at least one full rowgroup of context.
+     */
+    for (extra = 0; extra < rgroup * 2; extra++)
+      list[real_rows + extra] = list[real_rows - 1];
+  }
+}
+
+
+/*
+ * Initialize for a processing pass.
+ */
+
+METHODDEF(void)
+start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+
+  if (pass_mode == JBUF_PASS_THRU) {
+    /* Mark buffer empty */
+    main_ptr->buffer_full = FALSE;
+    main_ptr->rowgroup_ctr = 0;
+    if (!cinfo->upsample->need_context_rows) {
+      /* Simple case with no context needed */
+      main_ptr->pub.process_data = process_data_simple_main;
+      return;
+    }
+    main_ptr->pub.process_data = process_data_context_main;
+    make_funny_pointers(cinfo);         /* Create the xbuffer[] lists */
+    main_ptr->whichptr = 0;     /* Read first iMCU row into xbuffer[0] */
+    main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+    main_ptr->iMCU_row_ctr = 0;
+    return;
+  }
+#ifdef QUANT_2PASS_SUPPORTED
+  if (pass_mode == JBUF_CRANK_DEST) {
+    /* For last pass of 2-pass quantization, just crank the postprocessor */
+    main_ptr->pub.process_data = process_data_crank_post;
+    return;
+  }
+#endif
+  /* Every other buffer mode is rejected */
+  ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+}
+
+
+/*
+ * Process some data.
+ * This handles the simple case where no context is required.
+ */
+
+METHODDEF(void)
+process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  JDIMENSION groups;
+
+  /* Fetch a fresh iMCU row unless one is still waiting in the main buffer */
+  if (!main_ptr->buffer_full) {
+    if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+      return;                   /* suspension forced, can do nothing more */
+    main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row */
+  }
+
+  /* There are always min_DCT_scaled_size row groups in an iMCU row.
+   * At the bottom of the image this may hand extra garbage row groups to the
+   * postprocessor; it has to check for the bottom of the image anyway (at
+   * row resolution), so nothing is gained by checking here as well.
+   */
+  groups = (JDIMENSION)cinfo->_min_DCT_scaled_size;
+
+  /* Feed the postprocessor */
+  cinfo->post->post_process_data(cinfo, main_ptr->buffer,
+                                 &main_ptr->rowgroup_ctr, groups,
+                                 output_buf, out_row_ctr, out_rows_avail);
+
+  /* The buffer stays full until the postprocessor has taken every row group */
+  if (main_ptr->rowgroup_ctr < groups)
+    return;
+  main_ptr->buffer_full = FALSE;
+  main_ptr->rowgroup_ctr = 0;
+}
+
+
+/*
+ * Process some data.
+ * Context-row case: a resumable state machine driven by context_state.
+ */
+
+METHODDEF(void)
+process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+
+ /* Read input data if we haven't filled the main buffer yet */
+ if (!main_ptr->buffer_full) {
+ if (!(*cinfo->coef->decompress_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr]))
+ return; /* suspension forced, can do nothing more */
+ main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */
+ main_ptr->iMCU_row_ctr++; /* count rows received */
+ }
+
+ /* Postprocessor typically will not swallow all the input data it is handed
+ * in one call (due to filling the output buffer first). Must be prepared
+ * to exit and restart. This switch lets us keep track of how far we got.
+ * Note that each case falls through to the next on successful completion.
+ */
+ switch (main_ptr->context_state) {
+ case CTX_POSTPONED_ROW:
+ /* Call postprocessor using previously set pointers for postponed row */
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
+ if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+ return; /* Need to suspend */
+ main_ptr->context_state = CTX_PREPARE_FOR_IMCU; /* postponed row is done */
+ if (*out_row_ctr >= out_rows_avail)
+ return; /* Postprocessor exactly filled output buf */
+ FALLTHROUGH /*FALLTHROUGH*/
+ case CTX_PREPARE_FOR_IMCU:
+ /* Prepare to process first M-1 row groups of this iMCU row */
+ main_ptr->rowgroup_ctr = 0;
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size - 1);
+ /* Check for bottom of image: if so, tweak pointers to "duplicate"
+ * the last sample row, and adjust rowgroups_avail to ignore padding rows.
+ */
+ if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
+ set_bottom_pointers(cinfo);
+ main_ptr->context_state = CTX_PROCESS_IMCU; /* re-entry resumes below */
+ FALLTHROUGH /*FALLTHROUGH*/
+ case CTX_PROCESS_IMCU:
+ /* Call postprocessor using previously set pointers */
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
+ if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+ return; /* Need to suspend */
+ /* After the first iMCU, change wraparound pointers to normal state */
+ if (main_ptr->iMCU_row_ctr == 1)
+ set_wraparound_pointers(cinfo);
+ /* Prepare to load new iMCU row using other xbuffer list */
+ main_ptr->whichptr ^= 1; /* 0=>1 or 1=>0 */
+ main_ptr->buffer_full = FALSE; /* next call reads another iMCU row */
+ /* Still need to process last row group of this iMCU row, */
+ /* which is saved at index M+1 of the other xbuffer */
+ main_ptr->rowgroup_ctr = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 2);
+ main_ptr->context_state = CTX_POSTPONED_ROW; /* handled on the next call */
+ }
+}
+
+
+/*
+ * Process some data.
+ * Final pass of two-pass quantization: just call the postprocessor.
+ * Source data will be the postprocessor controller's internal buffer.
+ */
+
+#ifdef QUANT_2PASS_SUPPORTED
+
+METHODDEF(void)
+process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+ (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL, /* no input */
+ (JDIMENSION *)NULL, (JDIMENSION)0, /* no row-group counter or count */
+ output_buf, out_row_ctr, out_rows_avail);
+}
+
+#endif /* QUANT_2PASS_SUPPORTED */
+
+
+/*
+ * Initialize main buffer controller: allocate it and its sample buffers.
+ */
+
+GLOBAL(void)
+jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+{
+ my_main_ptr main_ptr;
+ int ci, rgroup, ngroups;
+ jpeg_component_info *compptr;
+
+ main_ptr = (my_main_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_main_controller));
+ cinfo->main = (struct jpeg_d_main_controller *)main_ptr;
+ main_ptr->pub.start_pass = start_pass_main;
+
+ if (need_full_buffer) /* shouldn't happen */
+ ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
+ /* Allocate the workspace.
+ * ngroups is the number of row groups we need.
+ */
+ if (cinfo->upsample->need_context_rows) {
+ if (cinfo->_min_DCT_scaled_size < 2) /* unsupported, see comments above */
+ ERREXIT(cinfo, JERR_NOTIMPL);
+ alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
+ ngroups = cinfo->_min_DCT_scaled_size + 2; /* two extra row groups */
+ } else {
+ ngroups = cinfo->_min_DCT_scaled_size; /* exactly one iMCU row */
+ }
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+ cinfo->_min_DCT_scaled_size; /* height of a row group of component */
+ main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ compptr->width_in_blocks * compptr->_DCT_scaled_size, /* row width */
+ (JDIMENSION)(rgroup * ngroups)); /* number of sample rows */
+ }
+}
diff --git a/media/libjpeg/jdmainct.h b/media/libjpeg/jdmainct.h
new file mode 100644
index 0000000000..37b201ca88
--- /dev/null
+++ b/media/libjpeg/jdmainct.h
@@ -0,0 +1,71 @@
+/*
+ * jdmainct.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Private buffer controller object */
+
+typedef struct {
+ struct jpeg_d_main_controller pub; /* public fields */
+
+ /* Pointer to allocated workspace (M or M+2 row groups). */
+ JSAMPARRAY buffer[MAX_COMPONENTS];
+
+ boolean buffer_full; /* Have we gotten an iMCU row from decoder? */
+ JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */
+
+ /* Remaining fields are only used in the context case. */
+
+ /* These are the master pointers to the funny-order pointer lists. */
+ JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */
+
+ int whichptr; /* index (0 or 1) of the xbuffer[] set now in use */
+ int context_state; /* process_data state machine status (CTX_*) */
+ JDIMENSION rowgroups_avail; /* row groups available to postprocessor */
+ JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */
+} my_main_controller;
+
+typedef my_main_controller *my_main_ptr;
+
+
+/* context_state values: */
+#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for iMCU row */
+#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */
+#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */
+
+
+LOCAL(void)
+set_wraparound_pointers(j_decompress_ptr cinfo)
+/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
+ * This changes the pointer list state from top-of-image to the normal state.
+ */
+{
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+ int ci, i, rgroup;
+ int M = cinfo->_min_DCT_scaled_size; /* row groups per iMCU row */
+ jpeg_component_info *compptr;
+ JSAMPARRAY xbuf0, xbuf1;
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+ cinfo->_min_DCT_scaled_size; /* height of a row group of component */
+ xbuf0 = main_ptr->xbuffer[0][ci];
+ xbuf1 = main_ptr->xbuffer[1][ci];
+ for (i = 0; i < rgroup; i++) {
+ xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i]; /* group -1 := group M+1 */
+ xbuf1[i - rgroup] = xbuf1[rgroup * (M + 1) + i];
+ xbuf0[rgroup * (M + 2) + i] = xbuf0[i]; /* group M+2 := group 0 */
+ xbuf1[rgroup * (M + 2) + i] = xbuf1[i];
+ }
+ }
+}
diff --git a/media/libjpeg/jdmarker.c b/media/libjpeg/jdmarker.c
new file mode 100644
index 0000000000..f7eba615fd
--- /dev/null
+++ b/media/libjpeg/jdmarker.c
@@ -0,0 +1,1374 @@
+/*
+ * jdmarker.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2012, 2015, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to decode JPEG datastream markers.
+ * Most of the complexity arises from our desire to support input
+ * suspension: if not all of the data for a marker is available,
+ * we must exit back to the application. On resumption, we reprocess
+ * the marker.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+typedef enum { /* JPEG marker codes (the byte following 0xFF) */
+ M_SOF0 = 0xc0,
+ M_SOF1 = 0xc1,
+ M_SOF2 = 0xc2,
+ M_SOF3 = 0xc3,
+
+ M_SOF5 = 0xc5,
+ M_SOF6 = 0xc6,
+ M_SOF7 = 0xc7,
+
+ M_JPG = 0xc8,
+ M_SOF9 = 0xc9,
+ M_SOF10 = 0xca,
+ M_SOF11 = 0xcb,
+
+ M_SOF13 = 0xcd,
+ M_SOF14 = 0xce,
+ M_SOF15 = 0xcf,
+
+ M_DHT = 0xc4, /* sits inside the SOFn numeric range */
+
+ M_DAC = 0xcc, /* sits inside the SOFn numeric range */
+
+ M_RST0 = 0xd0,
+ M_RST1 = 0xd1,
+ M_RST2 = 0xd2,
+ M_RST3 = 0xd3,
+ M_RST4 = 0xd4,
+ M_RST5 = 0xd5,
+ M_RST6 = 0xd6,
+ M_RST7 = 0xd7,
+
+ M_SOI = 0xd8,
+ M_EOI = 0xd9,
+ M_SOS = 0xda,
+ M_DQT = 0xdb,
+ M_DNL = 0xdc,
+ M_DRI = 0xdd,
+ M_DHP = 0xde,
+ M_EXP = 0xdf,
+
+ M_APP0 = 0xe0, /* APPn codes are contiguous: M_APP0 + n */
+ M_APP1 = 0xe1,
+ M_APP2 = 0xe2,
+ M_APP3 = 0xe3,
+ M_APP4 = 0xe4,
+ M_APP5 = 0xe5,
+ M_APP6 = 0xe6,
+ M_APP7 = 0xe7,
+ M_APP8 = 0xe8,
+ M_APP9 = 0xe9,
+ M_APP10 = 0xea,
+ M_APP11 = 0xeb,
+ M_APP12 = 0xec,
+ M_APP13 = 0xed,
+ M_APP14 = 0xee,
+ M_APP15 = 0xef,
+
+ M_JPG0 = 0xf0,
+ M_JPG13 = 0xfd,
+ M_COM = 0xfe,
+
+ M_TEM = 0x01,
+
+ M_ERROR = 0x100 /* larger than any one-byte marker code */
+} JPEG_MARKER;
+
+
+/* Private state of the marker reader */
+
+typedef struct {
+ struct jpeg_marker_reader pub; /* public fields */
+
+ /* Application-overridable marker processing methods */
+ jpeg_marker_parser_method process_COM;
+ jpeg_marker_parser_method process_APPn[16];
+
+ /* Limit on marker data length to save for each marker type */
+ unsigned int length_limit_COM;
+ unsigned int length_limit_APPn[16]; /* indexed by marker code - M_APP0 */
+
+ /* Status of COM/APPn marker saving */
+ jpeg_saved_marker_ptr cur_marker; /* NULL if not processing a marker */
+ unsigned int bytes_read; /* data bytes read so far in marker */
+ /* Note: cur_marker is not linked into marker_list until it's all read. */
+} my_marker_reader;
+
+typedef my_marker_reader *my_marker_ptr;
+
+
+/*
+ * Macros for fetching data from the data source module.
+ *
+ * At all times, cinfo->src->next_input_byte and ->bytes_in_buffer reflect
+ * the current restart point; we update them only when we have reached a
+ * suitable place to restart if a suspension occurs.
+ */
+
+/* Declare locals datasrc, next_input_byte, bytes_in_buffer from cinfo->src */
+#define INPUT_VARS(cinfo) \
+ struct jpeg_source_mgr *datasrc = (cinfo)->src; \
+ const JOCTET *next_input_byte = datasrc->next_input_byte; \
+ size_t bytes_in_buffer = datasrc->bytes_in_buffer
+
+/* Unload the local copies --- do this only at a restart boundary */
+#define INPUT_SYNC(cinfo) \
+ ( datasrc->next_input_byte = next_input_byte, \
+ datasrc->bytes_in_buffer = bytes_in_buffer )
+
+/* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
+#define INPUT_RELOAD(cinfo) \
+ ( next_input_byte = datasrc->next_input_byte, \
+ bytes_in_buffer = datasrc->bytes_in_buffer )
+
+/* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
+ * Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
+ * but we must reload the local copies after a successful fill.
+ */
+#define MAKE_BYTE_AVAIL(cinfo, action) \
+ if (bytes_in_buffer == 0) { \
+ if (!(*datasrc->fill_input_buffer) (cinfo)) \
+ { action; } \
+ INPUT_RELOAD(cinfo); \
+ }
+
+/* Read a byte into variable V.
+ * If must suspend, take the specified action (typically "return FALSE").
+ */
+#define INPUT_BYTE(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = *next_input_byte++; )
+
+/* As above, but read two bytes (high-order byte first) as an unsigned 16-bit
+ * integer. V should be declared unsigned int or perhaps JLONG.
+ */
+#define INPUT_2BYTES(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = ((unsigned int)(*next_input_byte++)) << 8; \
+ MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V += *next_input_byte++; )
+
+
+/*
+ * Routines to process JPEG markers.
+ *
+ * Entry condition: JPEG marker itself has been read and its code saved
+ * in cinfo->unread_marker; input restart point is just after the marker.
+ *
+ * Exit: if return TRUE, have read and processed any parameters, and have
+ * updated the restart point to point after the parameters.
+ * If return FALSE, was forced to suspend before reaching end of
+ * marker parameters; restart point has not been moved. Same routine
+ * will be called again after application supplies more input data.
+ *
+ * This approach to suspension assumes that all of a marker's parameters
+ * can fit into a single input bufferload. This should hold for "normal"
+ * markers. Some COM/APPn markers might have large parameter segments
+ * that might not fit. If we are simply dropping such a marker, we use
+ * skip_input_data to get past it, and thereby put the problem on the
+ * source manager's shoulders. If we are saving the marker's contents
+ * into memory, we use a slightly different convention: when forced to
+ * suspend, the marker processor updates the restart point to the end of
+ * what it's consumed (ie, the end of the buffer) before returning FALSE.
+ * On resumption, cinfo->unread_marker still contains the marker code,
+ * but the data source will point to the next chunk of marker data.
+ * The marker processor must retain internal state to deal with this.
+ *
+ * Note that we don't bother to avoid duplicate trace messages if a
+ * suspension occurs within marker parameters. Other side effects
+ * require more care.
+ */
+
+
+LOCAL(boolean)
+get_soi(j_decompress_ptr cinfo)
+/* Process an SOI marker: reset per-image defaults; always returns TRUE */
+{
+ int i;
+
+ TRACEMS(cinfo, 1, JTRC_SOI);
+
+ if (cinfo->marker->saw_SOI) /* a second SOI is an error */
+ ERREXIT(cinfo, JERR_SOI_DUPLICATE);
+
+ /* Reset all parameters that are defined to be reset by SOI */
+
+ for (i = 0; i < NUM_ARITH_TBLS; i++) {
+ cinfo->arith_dc_L[i] = 0;
+ cinfo->arith_dc_U[i] = 1;
+ cinfo->arith_ac_K[i] = 5;
+ }
+ cinfo->restart_interval = 0;
+
+ /* Set initial assumptions for colorspace etc */
+
+ cinfo->jpeg_color_space = JCS_UNKNOWN;
+ cinfo->CCIR601_sampling = FALSE; /* Assume non-CCIR sampling??? */
+
+ cinfo->saw_JFIF_marker = FALSE;
+ cinfo->JFIF_major_version = 1; /* set default JFIF APP0 values */
+ cinfo->JFIF_minor_version = 1;
+ cinfo->density_unit = 0;
+ cinfo->X_density = 1;
+ cinfo->Y_density = 1;
+ cinfo->saw_Adobe_marker = FALSE;
+ cinfo->Adobe_transform = 0;
+
+ cinfo->marker->saw_SOI = TRUE; /* lets the duplicate check above fire */
+
+ return TRUE;
+}
+
+
+LOCAL(boolean)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+/* Process a SOFn marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ int c, ci;
+ jpeg_component_info *compptr;
+ INPUT_VARS(cinfo);
+
+ cinfo->progressive_mode = is_prog;
+ cinfo->arith_code = is_arith;
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+
+ INPUT_BYTE(cinfo, cinfo->data_precision, return FALSE);
+ INPUT_2BYTES(cinfo, cinfo->image_height, return FALSE);
+ INPUT_2BYTES(cinfo, cinfo->image_width, return FALSE);
+ INPUT_BYTE(cinfo, cinfo->num_components, return FALSE);
+
+ length -= 8; /* the 2 length bytes plus the 6 parameter bytes read above */
+
+ TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
+ (int)cinfo->image_width, (int)cinfo->image_height,
+ cinfo->num_components);
+
+ if (cinfo->marker->saw_SOF)
+ ERREXIT(cinfo, JERR_SOF_DUPLICATE);
+
+ /* We don't support files in which the image height is initially specified */
+ /* as 0 and is later redefined by DNL. As long as we have to check that, */
+ /* might as well have a general sanity check. */
+ if (cinfo->image_height <= 0 || cinfo->image_width <= 0 ||
+ cinfo->num_components <= 0)
+ ERREXIT(cinfo, JERR_EMPTY_IMAGE);
+
+ if (length != (cinfo->num_components * 3)) /* 3 bytes per component follow */
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ if (cinfo->comp_info == NULL) /* do only once, even if suspend */
+ cinfo->comp_info = (jpeg_component_info *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * sizeof(jpeg_component_info));
+
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ compptr->component_index = ci;
+ INPUT_BYTE(cinfo, compptr->component_id, return FALSE);
+ INPUT_BYTE(cinfo, c, return FALSE);
+ compptr->h_samp_factor = (c >> 4) & 15; /* high nibble */
+ compptr->v_samp_factor = (c ) & 15; /* low nibble */
+ INPUT_BYTE(cinfo, compptr->quant_tbl_no, return FALSE);
+
+ TRACEMS4(cinfo, 1, JTRC_SOF_COMPONENT,
+ compptr->component_id, compptr->h_samp_factor,
+ compptr->v_samp_factor, compptr->quant_tbl_no);
+ }
+
+ cinfo->marker->saw_SOF = TRUE;
+
+ INPUT_SYNC(cinfo); /* whole marker consumed: advance the restart point */
+ return TRUE;
+}
+
+
+LOCAL(boolean)
+get_sos(j_decompress_ptr cinfo)
+/* Process a SOS marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ int i, ci, n, c, cc, pi;
+ jpeg_component_info *compptr;
+ INPUT_VARS(cinfo);
+
+ if (!cinfo->marker->saw_SOF)
+ ERREXIT(cinfo, JERR_SOS_NO_SOF);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+
+ INPUT_BYTE(cinfo, n, return FALSE); /* Number of components */
+
+ TRACEMS1(cinfo, 1, JTRC_SOS, n);
+
+ /* 6 = 2 length bytes + 1 count byte + 3 trailing bytes; 2 per component */
+ if (length != (n * 2 + 6) || n < 1 || n > MAX_COMPS_IN_SCAN)
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ cinfo->comps_in_scan = n;
+
+ /* Collect the component-spec parameters */
+
+ for (i = 0; i < MAX_COMPS_IN_SCAN; i++)
+ cinfo->cur_comp_info[i] = NULL;
+
+ for (i = 0; i < n; i++) {
+ INPUT_BYTE(cinfo, cc, return FALSE); /* component id */
+ INPUT_BYTE(cinfo, c, return FALSE); /* packed DC/AC table numbers */
+
+ for (ci = 0, compptr = cinfo->comp_info;
+ ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
+ ci++, compptr++) {
+ if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
+ goto id_found;
+ }
+
+ ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc); /* no component matched cc */
+
+id_found:
+
+ cinfo->cur_comp_info[i] = compptr;
+ compptr->dc_tbl_no = (c >> 4) & 15; /* high nibble */
+ compptr->ac_tbl_no = (c ) & 15; /* low nibble */
+
+ TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, cc,
+ compptr->dc_tbl_no, compptr->ac_tbl_no);
+
+ /* This CSi (cc) should differ from the previous CSi */
+ for (pi = 0; pi < i; pi++) {
+ if (cinfo->cur_comp_info[pi] == compptr) {
+ ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
+ }
+ }
+ }
+
+ /* Collect the additional scan parameters Ss, Se, Ah/Al. */
+ INPUT_BYTE(cinfo, c, return FALSE);
+ cinfo->Ss = c;
+ INPUT_BYTE(cinfo, c, return FALSE);
+ cinfo->Se = c;
+ INPUT_BYTE(cinfo, c, return FALSE);
+ cinfo->Ah = (c >> 4) & 15;
+ cinfo->Al = (c ) & 15;
+
+ TRACEMS4(cinfo, 1, JTRC_SOS_PARAMS, cinfo->Ss, cinfo->Se,
+ cinfo->Ah, cinfo->Al);
+
+ /* Prepare to scan data & restart markers */
+ cinfo->marker->next_restart_num = 0;
+
+ /* Count another SOS marker */
+ cinfo->input_scan_number++;
+
+ INPUT_SYNC(cinfo); /* whole marker consumed: advance the restart point */
+ return TRUE;
+}
+
+
+#ifdef D_ARITH_CODING_SUPPORTED
+
+LOCAL(boolean)
+get_dac(j_decompress_ptr cinfo)
+/* Process a DAC marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ int index, val;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* exclude the length word itself */
+
+ while (length > 0) {
+ INPUT_BYTE(cinfo, index, return FALSE);
+ INPUT_BYTE(cinfo, val, return FALSE);
+
+ length -= 2; /* one (index, val) pair consumed */
+
+ TRACEMS2(cinfo, 1, JTRC_DAC, index, val);
+
+ if (index < 0 || index >= (2 * NUM_ARITH_TBLS))
+ ERREXIT1(cinfo, JERR_DAC_INDEX, index);
+
+ if (index >= NUM_ARITH_TBLS) { /* define AC table */
+ cinfo->arith_ac_K[index - NUM_ARITH_TBLS] = (UINT8)val;
+ } else { /* define DC table */
+ cinfo->arith_dc_L[index] = (UINT8)(val & 0x0F); /* low nibble */
+ cinfo->arith_dc_U[index] = (UINT8)(val >> 4); /* high nibble */
+ if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
+ ERREXIT1(cinfo, JERR_DAC_VALUE, val);
+ }
+ }
+
+ if (length != 0) /* negative here means an odd number of data bytes */
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ INPUT_SYNC(cinfo);
+ return TRUE;
+}
+
+#else /* !D_ARITH_CODING_SUPPORTED */
+
+#define get_dac(cinfo) skip_variable(cinfo)
+
+#endif /* D_ARITH_CODING_SUPPORTED */
+
+
+LOCAL(boolean)
+get_dht(j_decompress_ptr cinfo)
+/* Process a DHT marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ UINT8 bits[17];
+ UINT8 huffval[256];
+ int i, index, count;
+ JHUFF_TBL **htblptr;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* exclude the length word itself */
+
+ while (length > 16) { /* room for an index byte plus 16 count bytes */
+ INPUT_BYTE(cinfo, index, return FALSE);
+
+ TRACEMS1(cinfo, 1, JTRC_DHT, index);
+
+ bits[0] = 0;
+ count = 0;
+ for (i = 1; i <= 16; i++) {
+ INPUT_BYTE(cinfo, bits[i], return FALSE);
+ count += bits[i]; /* total number of huffval entries that follow */
+ }
+
+ length -= 1 + 16; /* index byte + 16 count bytes */
+
+ TRACEMS8(cinfo, 2, JTRC_HUFFBITS,
+ bits[1], bits[2], bits[3], bits[4],
+ bits[5], bits[6], bits[7], bits[8]);
+ TRACEMS8(cinfo, 2, JTRC_HUFFBITS,
+ bits[9], bits[10], bits[11], bits[12],
+ bits[13], bits[14], bits[15], bits[16]);
+
+ /* Here we just do minimal validation of the counts to avoid walking
+ * off the end of our table space. jdhuff.c will check more carefully.
+ */
+ if (count > 256 || ((JLONG)count) > length)
+ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+
+ for (i = 0; i < count; i++)
+ INPUT_BYTE(cinfo, huffval[i], return FALSE);
+
+ memset(&huffval[count], 0, (256 - count) * sizeof(UINT8)); /* zero the rest */
+
+ length -= count;
+
+ if (index & 0x10) { /* AC table definition */
+ index -= 0x10;
+ if (index < 0 || index >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_DHT_INDEX, index);
+ htblptr = &cinfo->ac_huff_tbl_ptrs[index];
+ } else { /* DC table definition */
+ if (index < 0 || index >= NUM_HUFF_TBLS)
+ ERREXIT1(cinfo, JERR_DHT_INDEX, index);
+ htblptr = &cinfo->dc_huff_tbl_ptrs[index];
+ }
+
+ if (*htblptr == NULL) /* allocate the table slot on first use */
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
+
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+ memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
+ }
+
+ if (length != 0) /* leftover bytes too short to hold another table */
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ INPUT_SYNC(cinfo);
+ return TRUE;
+}
+
+
+LOCAL(boolean)
+get_dqt(j_decompress_ptr cinfo)
+/* Process a DQT marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ int n, i, prec;
+ unsigned int tmp;
+ JQUANT_TBL *quant_ptr;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* exclude the length word itself */
+
+ while (length > 0) {
+ INPUT_BYTE(cinfo, n, return FALSE);
+ prec = n >> 4; /* high nibble: nonzero selects 2-byte entries below */
+ n &= 0x0F; /* low nibble: table number */
+
+ TRACEMS2(cinfo, 1, JTRC_DQT, n, prec);
+
+ if (n >= NUM_QUANT_TBLS)
+ ERREXIT1(cinfo, JERR_DQT_INDEX, n);
+
+ if (cinfo->quant_tbl_ptrs[n] == NULL) /* allocate the slot on first use */
+ cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr)cinfo);
+ quant_ptr = cinfo->quant_tbl_ptrs[n];
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ if (prec)
+ INPUT_2BYTES(cinfo, tmp, return FALSE);
+ else
+ INPUT_BYTE(cinfo, tmp, return FALSE);
+ /* We convert the zigzag-order table to natural array order. */
+ quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16)tmp;
+ }
+
+ if (cinfo->err->trace_level >= 2) {
+ for (i = 0; i < DCTSIZE2; i += 8) {
+ TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
+ quant_ptr->quantval[i], quant_ptr->quantval[i + 1],
+ quant_ptr->quantval[i + 2], quant_ptr->quantval[i + 3],
+ quant_ptr->quantval[i + 4], quant_ptr->quantval[i + 5],
+ quant_ptr->quantval[i + 6], quant_ptr->quantval[i + 7]);
+ }
+ }
+
+ length -= DCTSIZE2 + 1; /* selector byte + one byte per entry */
+ if (prec) length -= DCTSIZE2; /* plus the second byte of each entry */
+ }
+
+ if (length != 0) /* negative: the last table ran past the stated length */
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ INPUT_SYNC(cinfo);
+ return TRUE;
+}
+
+
+LOCAL(boolean)
+get_dri(j_decompress_ptr cinfo)
+/* Process a DRI marker; returns FALSE if input suspension was forced */
+{
+ JLONG length;
+ unsigned int tmp;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+
+ if (length != 4) /* 2 length bytes + the 2-byte value read below */
+ ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+ INPUT_2BYTES(cinfo, tmp, return FALSE);
+
+ TRACEMS1(cinfo, 1, JTRC_DRI, tmp);
+
+ cinfo->restart_interval = tmp;
+
+ INPUT_SYNC(cinfo); /* whole marker consumed: advance the restart point */
+ return TRUE;
+}
+
+
+/*
+ * Routines for processing APPn and COM markers.
+ * These are either saved in memory or discarded, per application request.
+ * APP0 and APP14 are specially checked to see if they are
+ * JFIF and Adobe markers, respectively.
+ */
+
+#define APP0_DATA_LEN 14 /* Length of interesting data in APP0 */
+#define APP14_DATA_LEN 12 /* Length of interesting data in APP14 */
+#define APPN_DATA_LEN 14 /* Must be the largest of the above!! */
+
+
+LOCAL(void)
+examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
+/* Examine first few bytes from an APP0.
+ * Take appropriate action if it is a JFIF marker.
+ * datalen is # of bytes at data[], remaining is length of rest of marker data.
+ */
+{
+ JLONG totallen = (JLONG)datalen + remaining; /* full marker data length */
+
+ if (datalen >= APP0_DATA_LEN &&
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x49 &&
+ data[3] == 0x46 &&
+ data[4] == 0) { /* ASCII "JFIF" followed by a zero byte */
+ /* Found JFIF APP0 marker: save info */
+ cinfo->saw_JFIF_marker = TRUE;
+ cinfo->JFIF_major_version = data[5];
+ cinfo->JFIF_minor_version = data[6];
+ cinfo->density_unit = data[7];
+ cinfo->X_density = (data[8] << 8) + data[9]; /* high-order byte first */
+ cinfo->Y_density = (data[10] << 8) + data[11];
+ /* Check version.
+ * Major version must be 1, anything else signals an incompatible change.
+ * (We used to treat this as an error, but now it's a nonfatal warning,
+ * because some bozo at Hijaak couldn't read the spec.)
+ * Minor version should be 0..2, but process anyway if newer.
+ */
+ if (cinfo->JFIF_major_version != 1)
+ WARNMS2(cinfo, JWRN_JFIF_MAJOR,
+ cinfo->JFIF_major_version, cinfo->JFIF_minor_version);
+ /* Generate trace messages */
+ TRACEMS5(cinfo, 1, JTRC_JFIF,
+ cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
+ cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
+ /* Validate thumbnail dimensions and issue appropriate messages */
+ if (data[12] | data[13])
+ TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
+ totallen -= APP0_DATA_LEN; /* what is left should be thumbnail data */
+ if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
+ TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
+ } else if (datalen >= 6 &&
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x58 &&
+ data[3] == 0x58 &&
+ data[4] == 0) { /* ASCII "JFXX" followed by a zero byte */
+ /* Found JFIF "JFXX" extension APP0 marker */
+ /* The library doesn't actually do anything with these,
+ * but we try to produce a helpful trace message.
+ */
+ switch (data[5]) { /* extension code selects the trace message */
+ case 0x10:
+ TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
+ break;
+ case 0x11:
+ TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int)totallen);
+ break;
+ case 0x13:
+ TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
+ break;
+ default:
+ TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
+ break;
+ }
+ } else {
+ /* Start of APP0 does not match "JFIF" or "JFXX", or too short */
+ TRACEMS1(cinfo, 1, JTRC_APP0, (int)totallen);
+ }
+}
+
+
+LOCAL(void)
+examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
+/* Examine first few bytes from an APP14.
+ * Take appropriate action if it is an Adobe marker.
+ * datalen is # of bytes at data[], remaining is length of rest of marker data.
+ */
+{
+ unsigned int version, flags0, flags1, transform;
+
+ if (datalen >= APP14_DATA_LEN &&
+ data[0] == 0x41 &&
+ data[1] == 0x64 &&
+ data[2] == 0x6F &&
+ data[3] == 0x62 &&
+ data[4] == 0x65) { /* ASCII "Adobe" */
+ /* Found Adobe APP14 marker */
+ version = (data[5] << 8) + data[6]; /* 16-bit fields, high byte first */
+ flags0 = (data[7] << 8) + data[8];
+ flags1 = (data[9] << 8) + data[10];
+ transform = data[11];
+ TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
+ cinfo->saw_Adobe_marker = TRUE;
+ cinfo->Adobe_transform = (UINT8)transform; /* only this field is kept */
+ } else {
+ /* Start of APP14 does not match "Adobe", or too short */
+ TRACEMS1(cinfo, 1, JTRC_APP14, (int)(datalen + remaining));
+ }
+}
+
+
+METHODDEF(boolean)
+get_interesting_appn(j_decompress_ptr cinfo)
+/* Process an APP0 or APP14 marker without saving it */
+{
+ JLONG length;
+ JOCTET b[APPN_DATA_LEN];
+ unsigned int i, numtoread;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* exclude the length word itself */
+
+ /* get the interesting part of the marker data */
+ if (length >= APPN_DATA_LEN)
+ numtoread = APPN_DATA_LEN; /* never read more than b[] can hold */
+ else if (length > 0)
+ numtoread = (unsigned int)length;
+ else
+ numtoread = 0; /* zero or negative (bogus) length: read nothing */
+ for (i = 0; i < numtoread; i++)
+ INPUT_BYTE(cinfo, b[i], return FALSE);
+ length -= numtoread; /* length is now the part not copied into b[] */
+
+ /* process it */
+ switch (cinfo->unread_marker) {
+ case M_APP0:
+ examine_app0(cinfo, (JOCTET *)b, numtoread, length);
+ break;
+ case M_APP14:
+ examine_app14(cinfo, (JOCTET *)b, numtoread, length);
+ break;
+ default:
+ /* can't get here unless jpeg_save_markers chooses wrong processor */
+ ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, cinfo->unread_marker);
+ break;
+ }
+
+ /* skip any remaining data -- could be lots */
+ INPUT_SYNC(cinfo); /* publish our position before skip_input_data runs */
+ if (length > 0)
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
+
+ return TRUE;
+}
+
+
+#ifdef SAVE_MARKERS_SUPPORTED
+
+METHODDEF(boolean)
+save_marker(j_decompress_ptr cinfo)
+/* Save an APPn or COM marker into the marker list */
+{
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
+ jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
+ unsigned int bytes_read, data_length;
+ JOCTET *data;
+ JLONG length = 0;
+ INPUT_VARS(cinfo);
+
+ if (cur_marker == NULL) {
+ /* begin reading a marker */
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* exclude the length word itself */
+ if (length >= 0) { /* watch out for bogus length word */
+ /* figure out how much we want to save */
+ unsigned int limit;
+ if (cinfo->unread_marker == (int)M_COM)
+ limit = marker->length_limit_COM;
+ else
+ limit = marker->length_limit_APPn[cinfo->unread_marker - (int)M_APP0];
+ if ((unsigned int)length < limit)
+ limit = (unsigned int)length; /* cannot save more than is present */
+ /* allocate and initialize the marker item */
+ cur_marker = (jpeg_saved_marker_ptr)
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(struct jpeg_marker_struct) + limit);
+ cur_marker->next = NULL;
+ cur_marker->marker = (UINT8)cinfo->unread_marker;
+ cur_marker->original_length = (unsigned int)length;
+ cur_marker->data_length = limit;
+ /* data area is just beyond the jpeg_marker_struct */
+ data = cur_marker->data = (JOCTET *)(cur_marker + 1);
+ marker->cur_marker = cur_marker; /* remembered across suspensions */
+ marker->bytes_read = 0;
+ bytes_read = 0;
+ data_length = limit;
+ } else {
+ /* deal with bogus length word */
+ bytes_read = data_length = 0; /* the copy loop below is skipped */
+ data = NULL;
+ }
+ } else {
+ /* resume reading a marker */
+ bytes_read = marker->bytes_read;
+ data_length = cur_marker->data_length;
+ data = cur_marker->data + bytes_read; /* continue where we left off */
+ }
+
+ while (bytes_read < data_length) {
+ INPUT_SYNC(cinfo); /* move the restart point to here */
+ marker->bytes_read = bytes_read;
+ /* If there's not at least one byte in buffer, suspend */
+ MAKE_BYTE_AVAIL(cinfo, return FALSE);
+ /* Copy bytes with reasonable rapidity */
+ while (bytes_read < data_length && bytes_in_buffer > 0) {
+ *data++ = *next_input_byte++;
+ bytes_in_buffer--;
+ bytes_read++;
+ }
+ }
+
+ /* Done reading what we want to read */
+ if (cur_marker != NULL) { /* will be NULL if bogus length word */
+ /* Add new marker to end of list */
+ if (cinfo->marker_list == NULL) {
+ cinfo->marker_list = cur_marker;
+ } else {
+ jpeg_saved_marker_ptr prev = cinfo->marker_list;
+ while (prev->next != NULL)
+ prev = prev->next;
+ prev->next = cur_marker;
+ }
+ /* Reset pointer & calc remaining data length */
+ data = cur_marker->data;
+ length = cur_marker->original_length - data_length; /* unsaved bytes */
+ }
+ /* Reset to initial state for next marker */
+ marker->cur_marker = NULL;
+
+ /* Process the marker if interesting; else just make a generic trace msg */
+ switch (cinfo->unread_marker) {
+ case M_APP0:
+ examine_app0(cinfo, data, data_length, length);
+ break;
+ case M_APP14:
+ examine_app14(cinfo, data, data_length, length);
+ break;
+ default:
+ TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
+ (int)(data_length + length));
+ break;
+ }
+
+ /* skip any remaining data -- could be lots */
+ INPUT_SYNC(cinfo); /* do before skip_input_data */
+ if (length > 0)
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
+
+ return TRUE;
+}
+
+#endif /* SAVE_MARKERS_SUPPORTED */
+
+
+METHODDEF(boolean)
+skip_variable(j_decompress_ptr cinfo)
+/* Skip over an unknown or uninteresting variable-length marker.
+ * Reads the marker's 2-byte length word, emits a trace message, and asks the
+ * data source to skip the remaining bytes of the segment.
+ * Returns FALSE if the length word cannot be read (the failure action handed
+ * to INPUT_2BYTES is "return FALSE"); returns TRUE otherwise.
+ */
+{
+ JLONG length;
+ INPUT_VARS(cinfo);
+
+ INPUT_2BYTES(cinfo, length, return FALSE);
+ length -= 2; /* presumably the length word counts its own two bytes */
+
+ TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int)length);
+
+ INPUT_SYNC(cinfo); /* do before skip_input_data */
+ if (length > 0) /* nothing is skipped when no payload bytes remain */
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
+
+ return TRUE;
+}
+
+
+/*
+ * Find the next JPEG marker, save it in cinfo->unread_marker.
+ * Returns FALSE if had to suspend before reaching a marker;
+ * in that case cinfo->unread_marker is unchanged.
+ *
+ * Note that the result might not be a valid marker code,
+ * but it will never be 0 or FF.
+ */
+
+LOCAL(boolean)
+next_marker(j_decompress_ptr cinfo)
+{
+ int c; /* most recently read input byte; holds the marker code on exit */
+ INPUT_VARS(cinfo);
+
+ for (;;) {
+ INPUT_BYTE(cinfo, c, return FALSE);
+ /* Skip any non-FF bytes.
+ * This may look a bit inefficient, but it will not occur in a valid file.
+ * We sync after each discarded byte so that a suspending data source
+ * can discard the byte from its buffer.
+ */
+ while (c != 0xFF) {
+ cinfo->marker->discarded_bytes++;
+ INPUT_SYNC(cinfo);
+ INPUT_BYTE(cinfo, c, return FALSE);
+ }
+ /* This loop swallows any duplicate FF bytes. Extra FFs are legal as
+ * pad bytes, so don't count them in discarded_bytes. We assume there
+ * will not be so many consecutive FF bytes as to overflow a suspending
+ * data source's input buffer.
+ */
+ do {
+ INPUT_BYTE(cinfo, c, return FALSE);
+ } while (c == 0xFF);
+ if (c != 0)
+ break; /* found a valid marker, exit loop */
+ /* Reach here if we found a stuffed-zero data sequence (FF/00).
+ * Discard it and loop back to try again.
+ */
+ cinfo->marker->discarded_bytes += 2; /* count both the FF and the 00 */
+ INPUT_SYNC(cinfo);
+ }
+
+ if (cinfo->marker->discarded_bytes != 0) { /* report skipped bytes once, then restart the count */
+ WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
+ cinfo->marker->discarded_bytes = 0;
+ }
+
+ cinfo->unread_marker = c; /* only stored on success, so a FALSE return leaves it unchanged */
+
+ INPUT_SYNC(cinfo);
+ return TRUE;
+}
+
+
+LOCAL(boolean)
+first_marker(j_decompress_ptr cinfo)
+/* Like next_marker, but used to obtain the initial SOI marker. */
+/* For this marker, we do not allow preceding garbage or fill; otherwise,
+ * we might well scan an entire input file before realizing it is not JPEG.
+ * If an application wants to process non-JFIF files, it must seek to the
+ * SOI before calling the JPEG library.
+ * Returns FALSE if either of the two bytes cannot be read (the INPUT_BYTE
+ * failure action is "return FALSE"); in that case unread_marker is not set.
+ */
+{
+ int c, c2; /* the two bytes expected to be FF and the SOI code */
+ INPUT_VARS(cinfo);
+
+ INPUT_BYTE(cinfo, c, return FALSE);
+ INPUT_BYTE(cinfo, c2, return FALSE);
+ if (c != 0xFF || c2 != (int)M_SOI) /* anything but FF/SOI is reported as JERR_NO_SOI */
+ ERREXIT2(cinfo, JERR_NO_SOI, c, c2);
+
+ cinfo->unread_marker = c2;
+
+ INPUT_SYNC(cinfo);
+ return TRUE;
+}
+
+
+/*
+ * Read markers until SOS or EOI.
+ *
+ * Returns same codes as are defined for jpeg_consume_input:
+ * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ */
+
+METHODDEF(int)
+read_markers(j_decompress_ptr cinfo)
+{
+ /* Outer loop repeats once for each marker.
+ * The loop has no exit condition of its own; control leaves through the
+ * return statements below (suspension, SOS reached, EOI reached) or
+ * through the ERREXIT1 calls (presumably non-returning -- confirm).
+ */
+ for (;;) {
+ /* Collect the marker proper, unless we already did. */
+ /* NB: first_marker() enforces the requirement that SOI appear first. */
+ if (cinfo->unread_marker == 0) {
+ if (!cinfo->marker->saw_SOI) {
+ if (!first_marker(cinfo))
+ return JPEG_SUSPENDED;
+ } else {
+ if (!next_marker(cinfo))
+ return JPEG_SUSPENDED;
+ }
+ }
+ /* At this point cinfo->unread_marker contains the marker code and the
+ * input point is just past the marker proper, but before any parameters.
+ * A suspension will cause us to return with this state still true.
+ */
+ switch (cinfo->unread_marker) {
+ case M_SOI:
+ if (!get_soi(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_SOF0: /* Baseline */
+ case M_SOF1: /* Extended sequential, Huffman */
+ if (!get_sof(cinfo, FALSE, FALSE))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_SOF2: /* Progressive, Huffman */
+ if (!get_sof(cinfo, TRUE, FALSE))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_SOF9: /* Extended sequential, arithmetic */
+ if (!get_sof(cinfo, FALSE, TRUE))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_SOF10: /* Progressive, arithmetic */
+ if (!get_sof(cinfo, TRUE, TRUE))
+ return JPEG_SUSPENDED;
+ break;
+
+ /* Currently unsupported SOFn types */
+ case M_SOF3: /* Lossless, Huffman */
+ case M_SOF5: /* Differential sequential, Huffman */
+ case M_SOF6: /* Differential progressive, Huffman */
+ case M_SOF7: /* Differential lossless, Huffman */
+ case M_JPG: /* Reserved for JPEG extensions */
+ case M_SOF11: /* Lossless, arithmetic */
+ case M_SOF13: /* Differential sequential, arithmetic */
+ case M_SOF14: /* Differential progressive, arithmetic */
+ case M_SOF15: /* Differential lossless, arithmetic */
+ ERREXIT1(cinfo, JERR_SOF_UNSUPPORTED, cinfo->unread_marker);
+ break;
+
+ case M_SOS:
+ if (!get_sos(cinfo))
+ return JPEG_SUSPENDED;
+ cinfo->unread_marker = 0; /* processed the marker */
+ return JPEG_REACHED_SOS;
+
+ case M_EOI:
+ TRACEMS(cinfo, 1, JTRC_EOI);
+ cinfo->unread_marker = 0; /* processed the marker */
+ return JPEG_REACHED_EOI;
+
+ case M_DAC:
+ if (!get_dac(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_DHT:
+ if (!get_dht(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_DQT:
+ if (!get_dqt(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_DRI:
+ if (!get_dri(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_APP0: /* all APPn markers go through the process_APPn handler table */
+ case M_APP1:
+ case M_APP2:
+ case M_APP3:
+ case M_APP4:
+ case M_APP5:
+ case M_APP6:
+ case M_APP7:
+ case M_APP8:
+ case M_APP9:
+ case M_APP10:
+ case M_APP11:
+ case M_APP12:
+ case M_APP13:
+ case M_APP14:
+ case M_APP15:
+ if (!(*((my_marker_ptr)cinfo->marker)->process_APPn[
+ cinfo->unread_marker - (int)M_APP0]) (cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_COM: /* COM has its own replaceable handler */
+ if (!(*((my_marker_ptr)cinfo->marker)->process_COM) (cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ case M_RST0: /* these are all parameterless */
+ case M_RST1:
+ case M_RST2:
+ case M_RST3:
+ case M_RST4:
+ case M_RST5:
+ case M_RST6:
+ case M_RST7:
+ case M_TEM:
+ TRACEMS1(cinfo, 1, JTRC_PARMLESS_MARKER, cinfo->unread_marker);
+ break;
+
+ case M_DNL: /* Ignore DNL ... perhaps the wrong thing */
+ if (!skip_variable(cinfo))
+ return JPEG_SUSPENDED;
+ break;
+
+ default: /* must be DHP, EXP, JPGn, or RESn */
+ /* For now, we treat the reserved markers as fatal errors since they are
+ * likely to be used to signal incompatible JPEG Part 3 extensions.
+ * Once the JPEG 3 version-number marker is well defined, this code
+ * ought to change!
+ */
+ ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, cinfo->unread_marker);
+ break;
+ }
+ /* Successfully processed marker, so reset state variable */
+ cinfo->unread_marker = 0;
+ } /* end loop */
+}
+
+
+/*
+ * Read a restart marker, which is expected to appear next in the datastream;
+ * if the marker is not there, take appropriate recovery action.
+ * Returns FALSE if suspension is required.
+ *
+ * This is called by the entropy decoder after it has read an appropriate
+ * number of MCUs. cinfo->unread_marker may be nonzero if the entropy decoder
+ * has already read a marker from the data source. Under normal conditions
+ * cinfo->unread_marker will be reset to 0 before returning; if not reset,
+ * it holds a marker which the decoder will be unable to read past.
+ */
+
+METHODDEF(boolean)
+read_restart_marker(j_decompress_ptr cinfo)
+{
+  struct jpeg_marker_reader *reader = cinfo->marker;
+
+  /* Fetch a marker only if one is not already pending in unread_marker.
+   * next_marker() itself warns about any data it has to skip over.
+   */
+  if (cinfo->unread_marker == 0 && !next_marker(cinfo))
+    return FALSE;
+
+  if (cinfo->unread_marker != (int)M_RST0 + reader->next_restart_num) {
+    /* Not the restart marker we expected.  Hand the problem to the data
+     * source manager, which decides how to resynchronize.
+     */
+    if (!(*cinfo->src->resync_to_restart) (cinfo, reader->next_restart_num))
+      return FALSE;
+  } else {
+    /* Expected marker: consume it so the entropy decoder can carry on. */
+    TRACEMS1(cinfo, 3, JTRC_RST, reader->next_restart_num);
+    cinfo->unread_marker = 0;
+  }
+
+  /* Advance the expected restart number, wrapping within 0..7. */
+  reader->next_restart_num = (reader->next_restart_num + 1) & 7;
+
+  return TRUE;
+}
+
+
+/*
+ * This is the default resync_to_restart method for data source managers
+ * to use if they don't have any better approach. Some data source managers
+ * may be able to back up, or may have additional knowledge about the data
+ * which permits a more intelligent recovery strategy; such managers would
+ * presumably supply their own resync method.
+ *
+ * read_restart_marker calls resync_to_restart if it finds a marker other than
+ * the restart marker it was expecting. (This code is *not* used unless
+ * a nonzero restart interval has been declared.) cinfo->unread_marker is
+ * the marker code actually found (might be anything, except 0 or FF).
+ * The desired restart marker number (0..7) is passed as a parameter.
+ * This routine is supposed to apply whatever error recovery strategy seems
+ * appropriate in order to position the input stream to the next data segment.
+ * Note that cinfo->unread_marker is treated as a marker appearing before
+ * the current data-source input point; usually it should be reset to zero
+ * before returning.
+ * Returns FALSE if suspension is required.
+ *
+ * This implementation is substantially constrained by wanting to treat the
+ * input as a data stream; this means we can't back up. Therefore, we have
+ * only the following actions to work with:
+ * 1. Simply discard the marker and let the entropy decoder resume at next
+ * byte of file.
+ * 2. Read forward until we find another marker, discarding intervening
+ * data. (In theory we could look ahead within the current bufferload,
+ * without having to discard data if we don't find the desired marker.
+ * This idea is not implemented here, in part because it makes behavior
+ * dependent on buffer size and chance buffer-boundary positions.)
+ * 3. Leave the marker unread (by failing to zero cinfo->unread_marker).
+ * This will cause the entropy decoder to process an empty data segment,
+ * inserting dummy zeroes, and then we will reprocess the marker.
+ *
+ * #2 is appropriate if we think the desired marker lies ahead, while #3 is
+ * appropriate if the found marker is a future restart marker (indicating
+ * that we have missed the desired restart marker, probably because it got
+ * corrupted).
+ * We apply #2 or #3 if the found marker is a restart marker no more than
+ * two counts behind or ahead of the expected one. We also apply #2 if the
+ * found marker is not a legal JPEG marker code (it's certainly bogus data).
+ * If the found marker is a restart marker more than 2 counts away, we do #1
+ * (too much risk that the marker is erroneous; with luck we will be able to
+ * resync at some future point).
+ * For any valid non-restart JPEG marker, we apply #3. This keeps us from
+ * overrunning the end of a scan. An implementation limited to single-scan
+ * files might find it better to apply #2 for markers other than EOI, since
+ * any other marker would have to be bogus data in that case.
+ */
+
+GLOBAL(boolean)
+jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired)
+{
+  int found = cinfo->unread_marker;     /* marker actually encountered */
+  int action;                           /* recovery strategy: 1, 2 or 3 */
+
+  /* A resync is always worth a warning. */
+  WARNMS2(cinfo, JWRN_MUST_RESYNC, found, desired);
+
+  /* Each pass classifies the current marker and acts on it; only action 2
+   * (scan forward) comes back around for another decision.
+   */
+  for (;;) {
+    if (found < (int)M_SOF0) {
+      /* Not a legal marker code at all: skip ahead. */
+      action = 2;
+    } else if (found < (int)M_RST0 || found > (int)M_RST7) {
+      /* A legal marker that is not a restart marker: leave it pending. */
+      action = 3;
+    } else if (found == ((int)M_RST0 + ((desired + 1) & 7)) ||
+               found == ((int)M_RST0 + ((desired + 2) & 7))) {
+      /* One of the next two restarts after the expected one. */
+      action = 3;
+    } else if (found == ((int)M_RST0 + ((desired - 1) & 7)) ||
+               found == ((int)M_RST0 + ((desired - 2) & 7))) {
+      /* One of the two restarts before the expected one: skip ahead. */
+      action = 2;
+    } else {
+      /* The desired restart itself, or one too far away to trust. */
+      action = 1;
+    }
+
+    TRACEMS2(cinfo, 4, JTRC_RECOVERY_ACTION, found, action);
+
+    if (action == 1) {
+      /* Drop the marker; the entropy decoder resumes at the next byte. */
+      cinfo->unread_marker = 0;
+      return TRUE;
+    }
+    if (action == 3) {
+      /* Keep the marker pending; the entropy decoder will have to process
+       * an empty segment before it is looked at again.
+       */
+      return TRUE;
+    }
+
+    /* action == 2: move on to the next marker and classify that one. */
+    if (!next_marker(cinfo))
+      return FALSE;
+    found = cinfo->unread_marker;
+  }
+}
+
+
+/*
+ * Reset marker processing state to begin a fresh datastream.
+ */
+
+METHODDEF(void)
+reset_marker_reader(j_decompress_ptr cinfo)
+{
+  my_marker_ptr reader = (my_marker_ptr)cinfo->marker;
+
+  /* Marker-reader state, private and public parts */
+  reader->cur_marker = NULL;
+  reader->pub.discarded_bytes = 0;
+  reader->pub.saw_SOF = FALSE;
+  reader->pub.saw_SOI = FALSE;
+
+  /* Decompressor-level state */
+  cinfo->unread_marker = 0;             /* nothing pending */
+  cinfo->input_scan_number = 0;         /* no SOS encountered so far */
+  cinfo->comp_info = NULL;              /* get_sof allocates this later */
+}
+
+
+/*
+ * Initialize the marker reader module.
+ * This is called only once, when the decompression object is created.
+ */
+
+GLOBAL(void)
+jinit_marker_reader(j_decompress_ptr cinfo)
+{
+  my_marker_ptr reader;
+  int appn;
+
+  /* The reader object is allocated from the permanent pool. */
+  reader = (my_marker_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(my_marker_reader));
+  cinfo->marker = (struct jpeg_marker_reader *)reader;
+
+  /* Public method table */
+  reader->pub.reset_marker_reader = reset_marker_reader;
+  reader->pub.read_markers = read_markers;
+  reader->pub.read_restart_marker = read_restart_marker;
+
+  /* Default COM/APPn handling: APP0 and APP14 are examined on the fly and
+   * then dropped; COM and every other APPn are skipped.  Nothing is saved,
+   * so all length limits start at zero.
+   */
+  reader->process_COM = skip_variable;
+  reader->length_limit_COM = 0;
+  for (appn = 0; appn < 16; appn++) {
+    if (appn == 0 || appn == 14)
+      reader->process_APPn[appn] = get_interesting_appn;
+    else
+      reader->process_APPn[appn] = skip_variable;
+    reader->length_limit_APPn[appn] = 0;
+  }
+
+  /* Start out in the "fresh datastream" state. */
+  reset_marker_reader(cinfo);
+}
+
+
+/*
+ * Control saving of COM and APPn markers into marker_list.
+ */
+
+#ifdef SAVE_MARKERS_SUPPORTED
+
+GLOBAL(void)
+jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                  unsigned int length_limit)
+{
+  my_marker_ptr reader = (my_marker_ptr)cinfo->marker;
+  jpeg_marker_parser_method handler;
+  long maxlength;
+
+  /* Clamp the limit to what a single allocation can hold once the marker
+   * header struct is accounted for (mainly relevant on 16-bit systems).
+   */
+  maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
+  if (((long)length_limit) > maxlength)
+    length_limit = (unsigned int)maxlength;
+
+  /* Pick the handler.  APP0 and APP14 are special because the library
+   * itself needs to look at them.
+   */
+  if (length_limit == 0) {
+    /* Not saving: APP0/APP14 still get the on-the-fly examiner. */
+    if (marker_code == (int)M_APP0 || marker_code == (int)M_APP14)
+      handler = get_interesting_appn;
+    else
+      handler = skip_variable;
+  } else {
+    /* Saving: make sure APP0/APP14 keep enough bytes for internal use. */
+    handler = save_marker;
+    if (marker_code == (int)M_APP0) {
+      if (length_limit < APP0_DATA_LEN)
+        length_limit = APP0_DATA_LEN;
+    } else if (marker_code == (int)M_APP14) {
+      if (length_limit < APP14_DATA_LEN)
+        length_limit = APP14_DATA_LEN;
+    }
+  }
+
+  /* Install handler and limit in the slot for this marker code. */
+  if (marker_code == (int)M_COM) {
+    reader->process_COM = handler;
+    reader->length_limit_COM = length_limit;
+  } else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15) {
+    int slot = marker_code - (int)M_APP0;
+
+    reader->process_APPn[slot] = handler;
+    reader->length_limit_APPn[slot] = length_limit;
+  } else {
+    ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
+  }
+}
+
+#endif /* SAVE_MARKERS_SUPPORTED */
+
+
+/*
+ * Install a special processing method for COM or APPn markers.
+ */
+
+GLOBAL(void)
+jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                          jpeg_marker_parser_method routine)
+{
+  my_marker_ptr reader = (my_marker_ptr)cinfo->marker;
+
+  /* Only COM and APP0..APP15 have replaceable handlers. */
+  if (marker_code != (int)M_COM &&
+      (marker_code < (int)M_APP0 || marker_code > (int)M_APP15)) {
+    ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
+  } else if (marker_code == (int)M_COM) {
+    reader->process_COM = routine;
+  } else {
+    reader->process_APPn[marker_code - (int)M_APP0] = routine;
+  }
+}
diff --git a/media/libjpeg/jdmaster.c b/media/libjpeg/jdmaster.c
new file mode 100644
index 0000000000..a3690bf560
--- /dev/null
+++ b/media/libjpeg/jdmaster.c
@@ -0,0 +1,726 @@
+/*
+ * jdmaster.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
+ * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains master control logic for the JPEG decompressor.
+ * These routines are concerned with selecting the modules to be executed
+ * and with determining the number of passes and the work to be done in each
+ * pass.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+#include "jdmaster.h"
+
+
+/*
+ * Determine whether merged upsample/color conversion should be used.
+ * CRUCIAL: this must match the actual capabilities of jdmerge.c!
+ */
+
+LOCAL(boolean)
+use_merged_upsample(j_decompress_ptr cinfo)
+{
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  jpeg_component_info *comp = cinfo->comp_info;
+  int ci;
+
+  /* Merged upsampling behaves like plain box-filter upsampling, so it is
+   * ruled out whenever something fancier has been requested.
+   */
+  if (cinfo->do_fancy_upsampling || cinfo->CCIR601_sampling)
+    return FALSE;
+
+  /* The source must be 3-component YCbCr. */
+  if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3)
+    return FALSE;
+
+  /* The destination must be RGB565 or one of the RGB layouts, and the
+   * output component count must agree with that layout.
+   */
+  switch (cinfo->out_color_space) {
+  case JCS_RGB565:
+    if (cinfo->out_color_components != 3)
+      return FALSE;
+    break;
+  case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+  case JCS_EXT_RGBA:
+  case JCS_EXT_BGRA:
+  case JCS_EXT_ABGR:
+  case JCS_EXT_ARGB:
+    if (cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
+      return FALSE;
+    break;
+  default:
+    return FALSE;
+  }
+
+  /* Only 2h1v and 2h2v sampling layouts are handled. */
+  if (comp[0].h_samp_factor != 2 || comp[0].v_samp_factor > 2)
+    return FALSE;
+  if (comp[1].h_samp_factor != 1 || comp[1].v_samp_factor != 1)
+    return FALSE;
+  if (comp[2].h_samp_factor != 1 || comp[2].v_samp_factor != 1)
+    return FALSE;
+
+  /* Every component must use the same (minimum) IDCT scaling. */
+  for (ci = 0; ci < 3; ci++) {
+    if (comp[ci]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
+      return FALSE;
+  }
+
+  /* ??? also need to test for upsample-time rescaling, when & if supported */
+  return TRUE;                  /* all requirements met */
+#else
+  return FALSE;
+#endif
+}
+
+
+/*
+ * Compute core output image dimensions and DCT scaling choices.
+ * NOTE: this is exported (only when JPEG_LIB_VERSION >= 80) for possible use
+ * by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+#if JPEG_LIB_VERSION >= 80
+GLOBAL(void)
+#else
+LOCAL(void)
+#endif
+jpeg_core_output_dimensions(j_decompress_ptr cinfo)
+/* First step of output-size computation, shared by transcoding and full
+ * decompression: choose an N/DCTSIZE scaling factor (N in 1..16) and derive
+ * the output dimensions and the DCT scaled sizes from it.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  int scaled_size;
+  jpeg_component_info *compptr;
+
+  /* Find the smallest N for which scale_num/scale_denom <= N/DCTSIZE.
+   * If no N below 16 satisfies that, N = 16 is used unconditionally.
+   */
+  for (scaled_size = 1; scaled_size < 16; scaled_size++) {
+    if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * scaled_size)
+      break;
+  }
+
+  /* Provide scaled_size/block_size scaling */
+  cinfo->output_width = (JDIMENSION)
+    jdiv_round_up((long)cinfo->image_width * (long)scaled_size,
+                  (long)DCTSIZE);
+  cinfo->output_height = (JDIMENSION)
+    jdiv_round_up((long)cinfo->image_height * (long)scaled_size,
+                  (long)DCTSIZE);
+  cinfo->_min_DCT_h_scaled_size = scaled_size;
+  cinfo->_min_DCT_v_scaled_size = scaled_size;
+
+  /* Every component starts out with the minimum scaled size. */
+  for (ci = 0; ci < cinfo->num_components; ci++) {
+    compptr = &cinfo->comp_info[ci];
+    compptr->_DCT_h_scaled_size = cinfo->_min_DCT_h_scaled_size;
+    compptr->_DCT_v_scaled_size = cinfo->_min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* No scaling support: output size is simply the image size.
+   * jdinput.c has already initialized DCT_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+
+
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ * Also note that it may be called before the master module is initialized!
+ */
+
+GLOBAL(void)
+jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
+/* Do computations that are needed before master selection phase.
+ * Raises JERR_BAD_STATE unless global_state is DSTATE_READY.  Otherwise it
+ * fills in output_width/output_height (via jpeg_core_output_dimensions),
+ * the per-component DCT scaled sizes and downsampled dimensions (when IDCT
+ * scaling is compiled in), out_color_components, output_components and
+ * rec_outbuf_height.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+ int ci;
+ jpeg_component_info *compptr;
+#endif
+
+ /* Prevent application from calling me at wrong times */
+ if (cinfo->global_state != DSTATE_READY)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Compute core output image dimensions and DCT scaling choices. */
+ jpeg_core_output_dimensions(cinfo);
+
+#ifdef IDCT_SCALING_SUPPORTED
+
+ /* In selecting the actual DCT scaling for each component, we try to
+ * scale up the chroma components via IDCT scaling rather than upsampling.
+ * This saves time if the upsampler gets to use 1:1 scaling.
+ * Note this code adapts subsampling ratios which are powers of 2.
+ */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ int ssize = cinfo->_min_DCT_scaled_size; /* candidate size; doubled while both divisibility tests hold */
+ while (ssize < DCTSIZE &&
+ ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) %
+ (compptr->h_samp_factor * ssize * 2) == 0) &&
+ ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) %
+ (compptr->v_samp_factor * ssize * 2) == 0)) {
+ ssize = ssize * 2;
+ }
+#if JPEG_LIB_VERSION >= 70
+ compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize;
+#else
+ compptr->DCT_scaled_size = ssize;
+#endif
+ }
+
+ /* Recompute downsampled dimensions of components;
+ * application needs to know these if using raw downsampled data.
+ */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Size in samples, after IDCT scaling */
+ compptr->downsampled_width = (JDIMENSION)
+ jdiv_round_up((long)cinfo->image_width *
+ (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
+ compptr->downsampled_height = (JDIMENSION)
+ jdiv_round_up((long)cinfo->image_height *
+ (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
+ }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+ /* Hardwire it to "no scaling" */
+ cinfo->output_width = cinfo->image_width;
+ cinfo->output_height = cinfo->image_height;
+ /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
+ * and has computed unscaled downsampled_width and downsampled_height.
+ */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+
+ /* Report number of components in selected colorspace. */
+ /* Probably this should be in the color conversion module... */
+ switch (cinfo->out_color_space) {
+ case JCS_GRAYSCALE:
+ cinfo->out_color_components = 1;
+ break;
+ case JCS_RGB:
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ case JCS_EXT_RGBA:
+ case JCS_EXT_BGRA:
+ case JCS_EXT_ABGR:
+ case JCS_EXT_ARGB:
+ cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space]; /* pixel size is looked up per RGB layout */
+ break;
+ case JCS_YCbCr:
+ case JCS_RGB565:
+ cinfo->out_color_components = 3;
+ break;
+ case JCS_CMYK:
+ case JCS_YCCK:
+ cinfo->out_color_components = 4;
+ break;
+ default: /* else must be same colorspace as in file */
+ cinfo->out_color_components = cinfo->num_components;
+ break;
+ }
+ cinfo->output_components = (cinfo->quantize_colors ? 1 : /* a single component when color quantization is on */
+ cinfo->out_color_components);
+
+ /* See if upsampler will want to emit more than one row at a time */
+ if (use_merged_upsample(cinfo))
+ cinfo->rec_outbuf_height = cinfo->max_v_samp_factor;
+ else
+ cinfo->rec_outbuf_height = 1;
+}
+
+
+/*
+ * Several decompression processes need to range-limit values to the range
+ * 0..MAXJSAMPLE; the input value may fall somewhat outside this range
+ * due to noise introduced by quantization, roundoff error, etc. These
+ * processes are inner loops and need to be as fast as possible. On most
+ * machines, particularly CPUs with pipelines or instruction prefetch,
+ * a (subscript-check-less) C table lookup
+ * x = sample_range_limit[x];
+ * is faster than explicit tests
+ * if (x < 0) x = 0;
+ * else if (x > MAXJSAMPLE) x = MAXJSAMPLE;
+ * These processes all use a common table prepared by the routine below.
+ *
+ * For most steps we can mathematically guarantee that the initial value
+ * of x is within MAXJSAMPLE+1 of the legal range, so a table running from
+ * -(MAXJSAMPLE+1) to 2*MAXJSAMPLE+1 is sufficient. But for the initial
+ * limiting step (just after the IDCT), a wildly out-of-range value is
+ * possible if the input data is corrupt. To avoid any chance of indexing
+ * off the end of memory and getting a bad-pointer trap, we perform the
+ * post-IDCT limiting thus:
+ * x = range_limit[x & MASK];
+ * where MASK is 2 bits wider than legal sample data, ie 10 bits for 8-bit
+ * samples. Under normal circumstances this is more than enough range and
+ * a correct output will be generated; with bogus input data the mask will
+ * cause wraparound, and we will safely generate a bogus-but-in-range output.
+ * For the post-IDCT step, we want to convert the data from signed to unsigned
+ * representation by adding CENTERJSAMPLE at the same time that we limit it.
+ * So the post-IDCT limiting table ends up looking like this:
+ * CENTERJSAMPLE,CENTERJSAMPLE+1,...,MAXJSAMPLE,
+ * MAXJSAMPLE (repeat 2*(MAXJSAMPLE+1)-CENTERJSAMPLE times),
+ * 0 (repeat 2*(MAXJSAMPLE+1)-CENTERJSAMPLE times),
+ * 0,1,...,CENTERJSAMPLE-1
+ * Negative inputs select values from the upper half of the table after
+ * masking.
+ *
+ * We can save some space by overlapping the start of the post-IDCT table
+ * with the simpler range limiting table. The post-IDCT table begins at
+ * sample_range_limit + CENTERJSAMPLE.
+ */
+
+LOCAL(void)
+prepare_range_limit_table(j_decompress_ptr cinfo)
+/* Allocate (from JPOOL_IMAGE) and fill in the sample_range_limit table; cinfo->sample_range_limit is left pointing MAXJSAMPLE+1 entries into the allocation. */
+{
+ JSAMPLE *table;
+ int i;
+
+ table = (JSAMPLE *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE)); /* (MAXJSAMPLE+1) negative-index entries + CENTERJSAMPLE offset + 4*(MAXJSAMPLE+1) post-IDCT entries */
+ table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */
+ cinfo->sample_range_limit = table;
+ /* First segment of "simple" table: limit[x] = 0 for x < 0 */
+ memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
+ /* Main part of "simple" table: limit[x] = x */
+ for (i = 0; i <= MAXJSAMPLE; i++)
+ table[i] = (JSAMPLE)i;
+ table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */
+ /* End of simple table, rest of first half of post-IDCT table */
+ for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
+ table[i] = MAXJSAMPLE;
+ /* Second half of post-IDCT table: a run of zeroes, then 0..CENTERJSAMPLE-1 */
+ memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+ (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+ memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+ cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE)); /* source is the start of the simple table, which already holds 0..CENTERJSAMPLE-1 */
+}
+
+
+/*
+ * Master selection of decompression modules.
+ * This is done once at jpeg_start_decompress time. We determine
+ * which modules will be used and give them appropriate initialization calls.
+ * We also initialize the decompressor input side to begin consuming data.
+ *
+ * Since jpeg_read_header has finished, we know what is in the SOF
+ * and (first) SOS markers. We also have all the application parameter
+ * settings.
+ */
+
+LOCAL(void)
+master_selection(j_decompress_ptr cinfo) /* one-time module selection; called from jinit_master_decompress() */
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+ boolean use_c_buffer;
+ long samplesperrow;
+ JDIMENSION jd_samplesperrow;
+
+ /* Initialize dimensions and other stuff */
+ jpeg_calc_output_dimensions(cinfo);
+ prepare_range_limit_table(cinfo);
+
+ /* Width of an output scanline must be representable as JDIMENSION. */
+ samplesperrow = (long)cinfo->output_width *
+ (long)cinfo->out_color_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow; /* round-trip through JDIMENSION; a mismatch below means the product did not fit */
+ if ((long)jd_samplesperrow != samplesperrow)
+ ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+
+ /* Initialize my private state */
+ master->pass_number = 0;
+ master->using_merged_upsample = use_merged_upsample(cinfo);
+
+ /* Color quantizer selection */
+ master->quantizer_1pass = NULL;
+ master->quantizer_2pass = NULL;
+ /* No mode changes if not using buffered-image mode. */
+ if (!cinfo->quantize_colors || !cinfo->buffered_image) {
+ cinfo->enable_1pass_quant = FALSE;
+ cinfo->enable_external_quant = FALSE;
+ cinfo->enable_2pass_quant = FALSE;
+ }
+ if (cinfo->quantize_colors) {
+ if (cinfo->raw_data_out)
+ ERREXIT(cinfo, JERR_NOTIMPL);
+ /* 2-pass quantizer only works in 3-component color space. */
+ if (cinfo->out_color_components != 3) {
+ cinfo->enable_1pass_quant = TRUE;
+ cinfo->enable_external_quant = FALSE;
+ cinfo->enable_2pass_quant = FALSE;
+ cinfo->colormap = NULL; /* any application-supplied colormap is dropped in this case */
+ } else if (cinfo->colormap != NULL) {
+ cinfo->enable_external_quant = TRUE;
+ } else if (cinfo->two_pass_quantize) {
+ cinfo->enable_2pass_quant = TRUE;
+ } else {
+ cinfo->enable_1pass_quant = TRUE;
+ }
+
+ if (cinfo->enable_1pass_quant) {
+#ifdef QUANT_1PASS_SUPPORTED
+ jinit_1pass_quantizer(cinfo);
+ master->quantizer_1pass = cinfo->cquantize; /* remembered so prepare_for_output_pass() can reselect it */
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ }
+
+ /* We use the 2-pass code to map to external colormaps. */
+ if (cinfo->enable_2pass_quant || cinfo->enable_external_quant) {
+#ifdef QUANT_2PASS_SUPPORTED
+ jinit_2pass_quantizer(cinfo);
+ master->quantizer_2pass = cinfo->cquantize; /* remembered for later mode switches and jpeg_new_colormap() */
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ }
+ /* If both quantizers are initialized, the 2-pass one is left active;
+ * this is necessary for starting with quantization to an external map.
+ */
+ }
+
+ /* Post-processing: in particular, color conversion first */
+ if (!cinfo->raw_data_out) {
+ if (master->using_merged_upsample) {
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+ jinit_merged_upsampler(cinfo); /* does color conversion too */
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ } else {
+ jinit_color_deconverter(cinfo);
+ jinit_upsampler(cinfo);
+ }
+ jinit_d_post_controller(cinfo, cinfo->enable_2pass_quant); /* second argument presumably requests a full-image buffer for 2-pass quantization -- confirm against the post controller */
+ }
+ /* Inverse DCT */
+ jinit_inverse_dct(cinfo);
+ /* Entropy decoding: either Huffman or arithmetic coding. */
+ if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+ jinit_arith_decoder(cinfo);
+#else
+ ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+ } else {
+ if (cinfo->progressive_mode) {
+#ifdef D_PROGRESSIVE_SUPPORTED
+ jinit_phuff_decoder(cinfo);
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ } else
+ jinit_huff_decoder(cinfo);
+ }
+
+ /* Initialize principal buffer controllers. */
+ use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image; /* TRUE for multi-scan input or buffered-image mode */
+ jinit_d_coef_controller(cinfo, use_c_buffer);
+
+ if (!cinfo->raw_data_out)
+ jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
+
+ /* We can now tell the memory manager to allocate virtual arrays. */
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+ /* Initialize input side of decompressor to consume first scan. */
+ (*cinfo->inputctl->start_input_pass) (cinfo);
+
+ /* Set the first and last iMCU columns to decompress from single-scan images.
+ * By default, decompress all of the iMCU columns.
+ */
+ cinfo->master->first_iMCU_col = 0;
+ cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+ cinfo->master->last_good_iMCU_row = 0;
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+ /* If jpeg_start_decompress will read the whole file, initialize
+ * progress monitoring appropriately. The input step is counted
+ * as one pass.
+ */
+ if (cinfo->progress != NULL && !cinfo->buffered_image &&
+ cinfo->inputctl->has_multiple_scans) {
+ int nscans;
+ /* Estimate number of scans to set pass_limit. */
+ if (cinfo->progressive_mode) {
+ /* Arbitrarily estimate 2 interleaved DC scans + 3 AC scans/component. */
+ nscans = 2 + 3 * cinfo->num_components;
+ } else {
+ /* For a nonprogressive multiscan file, estimate 1 scan per component. */
+ nscans = cinfo->num_components;
+ }
+ cinfo->progress->pass_counter = 0L;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
+ cinfo->progress->completed_passes = 0;
+ cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2); /* the input pass plus one output pass, or two output passes with 2-pass quantization */
+ /* Count the input pass as done */
+ master->pass_number++;
+ }
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
+}
+
+
+/*
+ * Per-pass setup.
+ * This is called at the beginning of each output pass. We determine which
+ * modules will be active during this pass and give them appropriate
+ * start_pass calls. We also set is_dummy_pass to indicate whether this
+ * is a "real" output pass or a dummy pass for color quantization.
+ * (In the latter case, jdapistd.c will crank the pass to completion.)
+ */
+
+METHODDEF(void)
+prepare_for_output_pass(j_decompress_ptr cinfo) /* installed as master->pub.prepare_for_output_pass by jinit_master_decompress() */
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ if (master->pub.is_dummy_pass) { /* the flag is still set from the previous call, so that pass was the dummy one */
+#ifdef QUANT_2PASS_SUPPORTED
+ /* Final pass of 2-pass quantization */
+ master->pub.is_dummy_pass = FALSE;
+ (*cinfo->cquantize->start_pass) (cinfo, FALSE);
+ (*cinfo->post->start_pass) (cinfo, JBUF_CRANK_DEST);
+ (*cinfo->main->start_pass) (cinfo, JBUF_CRANK_DEST);
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif /* QUANT_2PASS_SUPPORTED */
+ } else {
+ if (cinfo->quantize_colors && cinfo->colormap == NULL) {
+ /* Select new quantization method */
+ if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) {
+ cinfo->cquantize = master->quantizer_2pass;
+ master->pub.is_dummy_pass = TRUE; /* this pass only gathers data; the next call runs the final pass */
+ } else if (cinfo->enable_1pass_quant) {
+ cinfo->cquantize = master->quantizer_1pass;
+ } else {
+ ERREXIT(cinfo, JERR_MODE_CHANGE);
+ }
+ }
+ (*cinfo->idct->start_pass) (cinfo);
+ (*cinfo->coef->start_output_pass) (cinfo);
+ if (!cinfo->raw_data_out) {
+ if (!master->using_merged_upsample) /* cconvert is only started when the separate (non-merged) upsampler is in use */
+ (*cinfo->cconvert->start_pass) (cinfo);
+ (*cinfo->upsample->start_pass) (cinfo);
+ if (cinfo->quantize_colors)
+ (*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass);
+ (*cinfo->post->start_pass) (cinfo,
+ (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
+ (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
+ }
+ }
+
+ /* Set up progress monitor's pass info if present */
+ if (cinfo->progress != NULL) {
+ cinfo->progress->completed_passes = master->pass_number;
+ cinfo->progress->total_passes = master->pass_number +
+ (master->pub.is_dummy_pass ? 2 : 1); /* a dummy pass is followed by one more (real) pass */
+ /* In buffered-image mode, we assume one more output pass if EOI not
+ * yet reached, but no more passes if EOI has been reached.
+ */
+ if (cinfo->buffered_image && !cinfo->inputctl->eoi_reached) {
+ cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1);
+ }
+ }
+}
+
+
+/*
+ * Finish up at end of an output pass.
+ */
+
+METHODDEF(void)
+finish_output_pass(j_decompress_ptr cinfo) /* installed as master->pub.finish_output_pass by jinit_master_decompress() */
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ if (cinfo->quantize_colors) /* cquantize is only called when color quantization is on */
+ (*cinfo->cquantize->finish_pass) (cinfo);
+ master->pass_number++; /* reported as completed_passes by prepare_for_output_pass() */
+}
+
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+
+/*
+ * Switch to a new external colormap between output passes.
+ */
+
+GLOBAL(void)
+jpeg_new_colormap(j_decompress_ptr cinfo) /* valid only in DSTATE_BUFIMAGE; any other state raises JERR_BAD_STATE */
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ /* Prevent application from calling me at wrong times */
+ if (cinfo->global_state != DSTATE_BUFIMAGE)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ if (cinfo->quantize_colors && cinfo->enable_external_quant &&
+ cinfo->colormap != NULL) { /* all three are required, otherwise JERR_MODE_CHANGE is raised below */
+ /* Select 2-pass quantizer for external colormap use */
+ cinfo->cquantize = master->quantizer_2pass;
+ /* Notify quantizer of colormap change */
+ (*cinfo->cquantize->new_color_map) (cinfo);
+ master->pub.is_dummy_pass = FALSE; /* just in case */
+ } else
+ ERREXIT(cinfo, JERR_MODE_CHANGE);
+}
+
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
+
+
+/*
+ * Initialize master decompression control and select active modules.
+ * This is performed at the start of jpeg_start_decompress.
+ */
+
+GLOBAL(void)
+jinit_master_decompress(j_decompress_ptr cinfo)
+{
+ my_master_ptr master = (my_master_ptr)cinfo->master; /* cinfo->master is used as-is; it is not allocated in this routine */
+
+ master->pub.prepare_for_output_pass = prepare_for_output_pass; /* install the per-pass method pointers */
+ master->pub.finish_output_pass = finish_output_pass;
+
+ master->pub.is_dummy_pass = FALSE;
+ master->pub.jinit_upsampler_no_alloc = FALSE;
+
+ master_selection(cinfo); /* selects and initializes the active decompression modules */
+}
diff --git a/media/libjpeg/jdmaster.h b/media/libjpeg/jdmaster.h
new file mode 100644
index 0000000000..76897e2820
--- /dev/null
+++ b/media/libjpeg/jdmaster.h
@@ -0,0 +1,28 @@
+/*
+ * jdmaster.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1995, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the master control structure for the JPEG decompressor.
+ */
+
+/* Private state */
+
+typedef struct {
+ struct jpeg_decomp_master pub; /* public fields; first member, so the struct can be addressed via a jpeg_decomp_master pointer */
+
+ int pass_number; /* # of passes completed */
+
+ boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */
+
+ /* Saved references to initialized quantizer modules,
+ * in case we need to switch modes.
+ */
+ struct jpeg_color_quantizer *quantizer_1pass; /* NULL unless the 1-pass quantizer was initialized */
+ struct jpeg_color_quantizer *quantizer_2pass; /* NULL unless the 2-pass quantizer was initialized */
+} my_decomp_master;
+
+typedef my_decomp_master *my_master_ptr; /* pointer type used when casting cinfo->master */
diff --git a/media/libjpeg/jdmerge.c b/media/libjpeg/jdmerge.c
new file mode 100644
index 0000000000..38b002729c
--- /dev/null
+++ b/media/libjpeg/jdmerge.c
@@ -0,0 +1,587 @@
+/*
+ * jdmerge.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, 2011, 2014-2015, 2020, D. R. Commander.
+ * Copyright (C) 2013, Linaro Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains code for merged upsampling/color conversion.
+ *
+ * This file combines functions from jdsample.c and jdcolor.c;
+ * read those files first to understand what's going on.
+ *
+ * When the chroma components are to be upsampled by simple replication
+ * (ie, box filtering), we can save some work in color conversion by
+ * calculating all the output pixels corresponding to a pair of chroma
+ * samples at one time. In the conversion equations
+ * R = Y + K1 * Cr
+ * G = Y + K2 * Cb + K3 * Cr
+ * B = Y + K4 * Cb
+ * only the Y term varies among the group of pixels corresponding to a pair
+ * of chroma samples, so the rest of the terms can be calculated just once.
+ * At typical sampling ratios, this eliminates half or three-quarters of the
+ * multiplications needed for color conversion.
+ *
+ * This file currently provides implementations for the following cases:
+ * YCbCr => RGB color conversion only.
+ * Sampling ratios of 2h1v or 2h2v.
+ * No scaling needed at upsample time.
+ * Corner-aligned (non-CCIR601) sampling alignment.
+ * Other special cases could be added, but in most applications these are
+ * the only common cases. (For uncommon cases we fall back on the more
+ * general code in jdsample.c and jdcolor.c.)
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdmerge.h"
+#include "jsimd.h"
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+
+
+#define SCALEBITS 16 /* speediest right-shift on some machines; number of fraction bits in the fixed-point values below */
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1)) /* 0.5 in SCALEBITS fixed point, added before a right shift to round */
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5)) /* constant x scaled to SCALEBITS fixed point (rounds to nearest for non-negative x) */
+
+
+/* Include inline routines for colorspace extensions */
+
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
+#include "jdmrgext.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef h2v1_merged_upsample_internal
+#undef h2v2_merged_upsample_internal
+
+
+/*
+ * Initialize tables for YCC->RGB colorspace conversion.
+ * This is taken directly from jdcolor.c; see that file for more info.
+ */
+
+LOCAL(void)
+build_ycc_rgb_table(j_decompress_ptr cinfo) /* allocates and fills the four chroma lookup tables held in cinfo->upsample */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ int i;
+ JLONG x;
+ SHIFT_TEMPS
+
+ upsample->Cr_r_tab = (int *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int)); /* one entry per sample value 0..MAXJSAMPLE, as for the three tables below */
+ upsample->Cb_b_tab = (int *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
+ upsample->Cr_g_tab = (JLONG *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG)); /* the two G tables hold still-scaled JLONG values, hence the wider element type */
+ upsample->Cb_g_tab = (JLONG *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
+
+ for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
+ /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
+ /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
+ /* Cr=>R value is nearest int to 1.40200 * x */
+ upsample->Cr_r_tab[i] = (int)
+ RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
+ /* Cb=>B value is nearest int to 1.77200 * x */
+ upsample->Cb_b_tab[i] = (int)
+ RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
+ /* Cr=>G value is scaled-up -0.71414 * x */
+ upsample->Cr_g_tab[i] = (-FIX(0.71414)) * x;
+ /* Cb=>G value is scaled-up -0.34414 * x */
+ /* We also add in ONE_HALF so that need not do it in inner loop */
+ upsample->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF; /* the inner loop sums the two G entries and shifts right by SCALEBITS */
+ }
+}
+
+
+/*
+ * Initialize for an upsampling pass.
+ */
+
+METHODDEF(void)
+start_pass_merged_upsample(j_decompress_ptr cinfo) /* installed as upsample->pub.start_pass by jinit_merged_upsampler() */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+
+ /* Mark the spare buffer empty */
+ upsample->spare_full = FALSE;
+ /* Initialize total-height counter for detecting bottom of image */
+ upsample->rows_to_go = cinfo->output_height; /* decremented by merged_2v_upsample() as rows are returned */
+}
+
+
+/*
+ * Control routine to do upsampling (and color conversion).
+ *
+ * The control routine just handles the row buffering considerations.
+ */
+
+METHODDEF(void)
+merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+/* 2:1 vertical sampling case: may need a spare row. in_row_groups_avail is not referenced in this routine. */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ JSAMPROW work_ptrs[2]; /* destinations for the two rows produced by one upmethod call */
+ JDIMENSION num_rows; /* number of rows returned to caller */
+
+ if (upsample->spare_full) {
+ /* If we have a spare row saved from a previous cycle, just return it. */
+ JDIMENSION size = upsample->out_row_width;
+ if (cinfo->out_color_space == JCS_RGB565)
+ size = cinfo->output_width * 2; /* RGB565 rows are copied as 2 samples per pixel instead of out_row_width */
+ jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
+ size);
+ num_rows = 1;
+ upsample->spare_full = FALSE;
+ } else {
+ /* Figure number of rows to return to caller. */
+ num_rows = 2;
+ /* Not more than the distance to the end of the image. */
+ if (num_rows > upsample->rows_to_go)
+ num_rows = upsample->rows_to_go;
+ /* And not more than what the client can accept: */
+ out_rows_avail -= *out_row_ctr;
+ if (num_rows > out_rows_avail)
+ num_rows = out_rows_avail; /* NOTE(review): if this yields 0, work_ptrs[0] below still targets output_buf[*out_row_ctr]; presumably callers guarantee at least one free row -- confirm */
+ /* Create output pointer array for upsampler. */
+ work_ptrs[0] = output_buf[*out_row_ctr];
+ if (num_rows > 1) {
+ work_ptrs[1] = output_buf[*out_row_ctr + 1];
+ } else {
+ work_ptrs[1] = upsample->spare_row; /* second row goes to the spare buffer and is handed out on the next call */
+ upsample->spare_full = TRUE;
+ }
+ /* Now do the upsampling. */
+ (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
+ }
+
+ /* Adjust counts */
+ *out_row_ctr += num_rows;
+ upsample->rows_to_go -= num_rows;
+ /* When the buffer is emptied, declare this input row group consumed */
+ if (!upsample->spare_full)
+ (*in_row_group_ctr)++;
+}
+
+
+METHODDEF(void)
+merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+/* 1:1 vertical sampling case: much easier, never need a spare row. in_row_groups_avail and out_rows_avail are not referenced here. */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+
+ /* Just do the upsampling. */
+ (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
+ output_buf + *out_row_ctr); /* writes directly into the caller's next output row */
+ /* Adjust counts */
+ (*out_row_ctr)++; /* one output row produced ... */
+ (*in_row_group_ctr)++; /* ... from one input row group */
+}
+
+
+/*
+ * These are the routines invoked by the control routines to do
+ * the actual upsampling/conversion. One row group is processed per call.
+ *
+ * Note: since we may be writing directly into application-supplied buffers,
+ * we have to be honest about the output width; we can't assume the buffer
+ * has been rounded up to an even width.
+ */
+
+
+/*
+ * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+METHODDEF(void)
+h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ switch (cinfo->out_color_space) { /* forward to the instantiation matching the output pixel layout */
+ case JCS_EXT_RGB:
+ extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA: /* shares the RGBX routine */
+ extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA: /* shares the BGRX routine */
+ extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR: /* shares the XBGR routine */
+ extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB: /* shares the XRGB routine */
+ extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default: /* every other color space uses the un-prefixed routine, presumably generated by the first jdmrgext.c inclusion with the default RGB_* layout -- confirm */
+ h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ }
+}
+
+
+/*
+ * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ */
+
+METHODDEF(void)
+h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ switch (cinfo->out_color_space) { /* forward to the instantiation matching the output pixel layout */
+ case JCS_EXT_RGB:
+ extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA: /* shares the RGBX routine */
+ extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA: /* shares the BGRX routine */
+ extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR: /* shares the XBGR routine */
+ extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB: /* shares the XRGB routine */
+ extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default: /* every other color space uses the un-prefixed routine, presumably generated by the first jdmrgext.c inclusion with the default RGB_* layout -- confirm */
+ h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ }
+}
+
+
+/*
+ * RGB565 conversion
+ */
+
+#define PACK_SHORT_565_LE(r, g, b) \
+ ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3)) /* RRRRRGGG GGGBBBBB: top 5/6/5 bits of r/g/b */
+#define PACK_SHORT_565_BE(r, g, b) \
+ (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00)) /* same 5-6-5 fields with the two bytes of the 16-bit value exchanged */
+
+#define PACK_TWO_PIXELS_LE(l, r) (((r) << 16) | (l)) /* l in the low 16 bits, r in the high 16; parameters parenthesized so expression arguments group correctly */
+#define PACK_TWO_PIXELS_BE(l, r) (((l) << 16) | (r)) /* l in the high 16 bits, r in the low 16; parameters parenthesized so expression arguments group correctly */
+
+#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
+ ((INT16 *)(addr))[0] = (INT16)(pixels); /* low 16 bits go to the first 16-bit slot */ \
+ ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
+}
+#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
+ ((INT16 *)(addr))[1] = (INT16)(pixels); /* low 16 bits go to the second 16-bit slot */ \
+ ((INT16 *)(addr))[0] = (INT16)((pixels) >> 16); \
+}
+
+#define DITHER_565_R(r, dither) ((r) + ((dither) & 0xFF)) /* add the low byte of the dither word */
+#define DITHER_565_G(g, dither) ((g) + (((dither) & 0xFF) >> 1)) /* green keeps 6 bits when packed, so only half the dither amount is added */
+#define DITHER_565_B(b, dither) ((b) + ((dither) & 0xFF))
+
+
+/* Declarations for ordered dithering
+ *
+ * We use a 4x4 ordered dither array packed into 32 bits. This array is
+ * sufficient for dithering RGB888 to RGB565.
+ */
+
+#define DITHER_MASK 0x3 /* presumably masks a row number into an index for dither_matrix[4] -- confirm at the use sites */
+#define DITHER_ROTATE(x) ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF)) /* rotate a 32-bit value right by 8 bits: the low byte moves to the top */
+static const JLONG dither_matrix[4] = { /* four 32-bit words, each packing four 8-bit dither values */
+ 0x0008020A,
+ 0x0C040E06,
+ 0x030B0109,
+ 0x0F070D05
+};
+
+
+/* Include inline routines for RGB565 conversion */
+
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
+#include "jdmrg565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef WRITE_TWO_PIXELS
+#undef h2v1_merged_upsample_565_internal
+#undef h2v1_merged_upsample_565D_internal
+#undef h2v2_merged_upsample_565_internal
+#undef h2v2_merged_upsample_565D_internal
+
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
+#include "jdmrg565.c"
+#undef PACK_SHORT_565
+#undef PACK_TWO_PIXELS
+#undef WRITE_TWO_PIXELS
+#undef h2v1_merged_upsample_565_internal
+#undef h2v1_merged_upsample_565D_internal
+#undef h2v2_merged_upsample_565_internal
+#undef h2v2_merged_upsample_565D_internal
+
+
+static INLINE boolean is_big_endian(void) /* run-time byte-order probe; returns TRUE on a big-endian host */
+{
+ int test_value = 1;
+ if (*(char *)&test_value != 1) /* first byte in memory is not the least significant one */
+ return TRUE;
+ return FALSE;
+}
+
+
+METHODDEF(void)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) /* h2v1 RGB565 without dithering: dispatch on host byte order */
+{
+ if (is_big_endian()) /* byte order is probed on every call */
+ h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ else
+ h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+}
+
+
+METHODDEF(void)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) /* h2v1 RGB565 with dithering: dispatch on host byte order */
+{
+ if (is_big_endian()) /* byte order is probed on every call */
+ h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ else
+ h2v1_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+}
+
+
+METHODDEF(void)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) /* h2v2 RGB565 without dithering: dispatch on host byte order */
+{
+ if (is_big_endian()) /* byte order is probed on every call */
+ h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ else
+ h2v2_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+}
+
+
+METHODDEF(void)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) /* h2v2 RGB565 with dithering: dispatch on host byte order */
+{
+ if (is_big_endian()) /* byte order is probed on every call */
+ h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ else
+ h2v2_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+}
+
+
+/*
+ * Module initialization routine for merged upsampling/color conversion.
+ *
+ * NB: this is called under the conditions determined by use_merged_upsample()
+ * in jdmaster.c. That routine MUST correspond to the actual capabilities
+ * of this module; no safety checks are made here.
+ */
+
+GLOBAL(void)
+jinit_merged_upsampler(j_decompress_ptr cinfo)
+{
+ my_merged_upsample_ptr upsample;
+
+ upsample = (my_merged_upsample_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_merged_upsampler));
+ cinfo->upsample = (struct jpeg_upsampler *)upsample; /* publish the private object through the generic upsampler pointer */
+ upsample->pub.start_pass = start_pass_merged_upsample;
+ upsample->pub.need_context_rows = FALSE;
+
+ upsample->out_row_width = cinfo->output_width * cinfo->out_color_components; /* samples per output row; also sizes the spare row below */
+
+ if (cinfo->max_v_samp_factor == 2) { /* 2:1 vertical sampling: two output rows per input row group */
+ upsample->pub.upsample = merged_2v_upsample;
+ if (jsimd_can_h2v2_merged_upsample())
+ upsample->upmethod = jsimd_h2v2_merged_upsample;
+ else
+ upsample->upmethod = h2v2_merged_upsample;
+ if (cinfo->out_color_space == JCS_RGB565) { /* RGB565 replaces whatever method was chosen above, including the SIMD one */
+ if (cinfo->dither_mode != JDITHER_NONE) {
+ upsample->upmethod = h2v2_merged_upsample_565D;
+ } else {
+ upsample->upmethod = h2v2_merged_upsample_565;
+ }
+ }
+ /* Allocate a spare row buffer */
+ upsample->spare_row = (JSAMPROW)
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
+ } else { /* any other vertical factor is handled as 1:1 */
+ upsample->pub.upsample = merged_1v_upsample;
+ if (jsimd_can_h2v1_merged_upsample())
+ upsample->upmethod = jsimd_h2v1_merged_upsample;
+ else
+ upsample->upmethod = h2v1_merged_upsample;
+ if (cinfo->out_color_space == JCS_RGB565) { /* RGB565 replaces whatever method was chosen above, including the SIMD one */
+ if (cinfo->dither_mode != JDITHER_NONE) {
+ upsample->upmethod = h2v1_merged_upsample_565D;
+ } else {
+ upsample->upmethod = h2v1_merged_upsample_565;
+ }
+ }
+ /* No spare row needed */
+ upsample->spare_row = NULL;
+ }
+
+ build_ycc_rgb_table(cinfo); /* must follow the cinfo->upsample assignment above, since the tables are stored through it */
+}
+
+#endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/media/libjpeg/jdmerge.h b/media/libjpeg/jdmerge.h
new file mode 100644
index 0000000000..b583396b10
--- /dev/null
+++ b/media/libjpeg/jdmerge.h
@@ -0,0 +1,47 @@
+/*
+ * jdmerge.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+
+
+/* Private subobject */
+
+typedef struct {
+ struct jpeg_upsampler pub; /* public fields; first member, so the struct can be addressed via a jpeg_upsampler pointer */
+
+ /* Pointer to routine to do actual upsampling/conversion of one row group */
+ void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+ /* Private state for YCC->RGB conversion */
+ int *Cr_r_tab; /* => table for Cr to R conversion */
+ int *Cb_b_tab; /* => table for Cb to B conversion */
+ JLONG *Cr_g_tab; /* => table for Cr to G conversion */
+ JLONG *Cb_g_tab; /* => table for Cb to G conversion */
+
+ /* For 2:1 vertical sampling, we produce two output rows at a time.
+ * We need a "spare" row buffer to hold the second output row if the
+ * application provides just a one-row buffer; we also use the spare
+ * to discard the dummy last row if the image height is odd.
+ */
+ JSAMPROW spare_row;
+ boolean spare_full; /* TRUE if spare buffer is occupied */
+
+ JDIMENSION out_row_width; /* samples per output row */
+ JDIMENSION rows_to_go; /* counts rows remaining in image */
+} my_merged_upsampler;
+
+typedef my_merged_upsampler *my_merged_upsample_ptr; /* pointer type used when casting cinfo->upsample */
+
+#endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/media/libjpeg/jdmrg565.c b/media/libjpeg/jdmrg565.c
new file mode 100644
index 0000000000..980a4e216e
--- /dev/null
+++ b/media/libjpeg/jdmrg565.c
@@ -0,0 +1,354 @@
+/*
+ * jdmrg565.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, Linaro Limited.
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains code for merged upsampling/color conversion.
+ */
+
+
+INLINE /* h2v1 merged upsample + color conversion of one row group into packed 16-bit 565 pixels, no dithering */
+LOCAL(void)
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* selects the row read from each of the three input planes */
+ JSAMPARRAY output_buf) /* only output_buf[0] is written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr;
+ JSAMPROW inptr0, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ unsigned int r, g, b;
+ JLONG rgb;
+ SHIFT_TEMPS
+
+ inptr0 = input_buf[0][in_row_group_ctr]; /* plane 0: read as luma (y) below */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr = output_buf[0];
+
+ /* Loop for each pair of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++; /* one chroma sample pair is shared by two luma samples */
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+
+ /* Fetch 2 Y values and emit 2 pixels */
+ y = *inptr0++;
+ r = range_limit[y + cred]; /* range_limit lookup takes the luma + chroma sum as its index */
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr0++;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); /* combine both packed pixels into one value for a single store */
+
+ WRITE_TWO_PIXELS(outptr, rgb);
+ outptr += 4; /* step past the two pixels just written (assumes 2 bytes per pixel with 1-byte JSAMPLE - TODO confirm) */
+ }
+
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+ y = *inptr0;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr = (INT16)rgb; /* store just the one remaining 16-bit pixel */
+ }
+}
+
+
+INLINE /* h2v1 merged upsample + color conversion of one row group into packed 16-bit 565 pixels, with dithering */
+LOCAL(void)
+h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* selects the row read from each of the three input planes */
+ JSAMPARRAY output_buf) /* only output_buf[0] is written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr;
+ JSAMPROW inptr0, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; /* dither value for this row, chosen by output scanline number */
+ unsigned int r, g, b;
+ JLONG rgb;
+ SHIFT_TEMPS
+
+ inptr0 = input_buf[0][in_row_group_ctr]; /* plane 0: read as luma (y) below */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr = output_buf[0];
+
+ /* Loop for each pair of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++; /* one chroma sample pair is shared by two luma samples */
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+
+ /* Fetch 2 Y values and emit 2 pixels */
+ y = *inptr0++;
+ r = range_limit[DITHER_565_R(y + cred, d0)]; /* dither macro is applied to the sum before the range_limit lookup */
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0); /* update the dither value before the next pixel (macro defined elsewhere) */
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr0++;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0);
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); /* combine both packed pixels into one value for a single store */
+
+ WRITE_TWO_PIXELS(outptr, rgb);
+ outptr += 4; /* step past the two pixels just written (assumes 2 bytes per pixel with 1-byte JSAMPLE - TODO confirm) */
+ }
+
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+ y = *inptr0;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr = (INT16)rgb; /* store just the one remaining 16-bit pixel */
+ }
+}
+
+
+INLINE /* h2v2 merged upsample + color conversion: one row group becomes two rows of packed 16-bit 565 pixels, no dithering */
+LOCAL(void)
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* chroma row index; luma rows 2*ctr and 2*ctr+1 are read */
+ JSAMPARRAY output_buf) /* output_buf[0] and output_buf[1] are both written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr0, outptr1;
+ JSAMPROW inptr00, inptr01, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ unsigned int r, g, b;
+ JLONG rgb;
+ SHIFT_TEMPS
+
+ inptr00 = input_buf[0][in_row_group_ctr * 2]; /* upper luma row of the group */
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1]; /* lower luma row of the group */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ /* Loop for each group of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++; /* one chroma sample pair is shared by a 2x2 block of luma samples */
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+
+ /* Fetch 4 Y values and emit 4 pixels */
+ y = *inptr00++;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr00++;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+ WRITE_TWO_PIXELS(outptr0, rgb); /* two pixels of the upper output row */
+ outptr0 += 4; /* assumes 2 bytes per pixel with 1-byte JSAMPLE - TODO confirm */
+
+ y = *inptr01++;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr01++;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+ WRITE_TWO_PIXELS(outptr1, rgb); /* two pixels of the lower output row */
+ outptr1 += 4;
+ }
+
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+
+ y = *inptr00;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr0 = (INT16)rgb; /* single remaining 16-bit pixel, upper row */
+
+ y = *inptr01;
+ r = range_limit[y + cred];
+ g = range_limit[y + cgreen];
+ b = range_limit[y + cblue];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr1 = (INT16)rgb; /* single remaining 16-bit pixel, lower row */
+ }
+}
+
+
+INLINE /* h2v2 merged upsample + color conversion: one row group becomes two rows of packed 16-bit 565 pixels, with dithering */
+LOCAL(void)
+h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* chroma row index; luma rows 2*ctr and 2*ctr+1 are read */
+ JSAMPARRAY output_buf) /* output_buf[0] and output_buf[1] are both written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr0, outptr1;
+ JSAMPROW inptr00, inptr01, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; /* dither value for the upper output row */
+ JLONG d1 = dither_matrix[(cinfo->output_scanline + 1) & DITHER_MASK]; /* dither value for the lower output row (next scanline) */
+ unsigned int r, g, b;
+ JLONG rgb;
+ SHIFT_TEMPS
+
+ inptr00 = input_buf[0][in_row_group_ctr * 2]; /* upper luma row of the group */
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1]; /* lower luma row of the group */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ /* Loop for each group of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++; /* one chroma sample pair is shared by a 2x2 block of luma samples */
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+
+ /* Fetch 4 Y values and emit 4 pixels */
+ y = *inptr00++;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0); /* update the upper-row dither value before the next pixel */
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr00++;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0);
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+ WRITE_TWO_PIXELS(outptr0, rgb); /* two pixels of the upper output row */
+ outptr0 += 4; /* assumes 2 bytes per pixel with 1-byte JSAMPLE - TODO confirm */
+
+ y = *inptr01++;
+ r = range_limit[DITHER_565_R(y + cred, d1)];
+ g = range_limit[DITHER_565_G(y + cgreen, d1)];
+ b = range_limit[DITHER_565_B(y + cblue, d1)];
+ d1 = DITHER_ROTATE(d1); /* update the lower-row dither value before the next pixel */
+ rgb = PACK_SHORT_565(r, g, b);
+
+ y = *inptr01++;
+ r = range_limit[DITHER_565_R(y + cred, d1)];
+ g = range_limit[DITHER_565_G(y + cgreen, d1)];
+ b = range_limit[DITHER_565_B(y + cblue, d1)];
+ d1 = DITHER_ROTATE(d1);
+ rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
+
+ WRITE_TWO_PIXELS(outptr1, rgb); /* two pixels of the lower output row */
+ outptr1 += 4;
+ }
+
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+
+ y = *inptr00;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr0 = (INT16)rgb; /* single remaining 16-bit pixel, upper row */
+
+ y = *inptr01;
+ r = range_limit[DITHER_565_R(y + cred, d1)];
+ g = range_limit[DITHER_565_G(y + cgreen, d1)];
+ b = range_limit[DITHER_565_B(y + cblue, d1)];
+ rgb = PACK_SHORT_565(r, g, b);
+ *(INT16 *)outptr1 = (INT16)rgb; /* single remaining 16-bit pixel, lower row */
+ }
+}
diff --git a/media/libjpeg/jdmrgext.c b/media/libjpeg/jdmrgext.c
new file mode 100644
index 0000000000..038abc75d7
--- /dev/null
+++ b/media/libjpeg/jdmrgext.c
@@ -0,0 +1,184 @@
+/*
+ * jdmrgext.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2011, 2015, 2020, 2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains code for merged upsampling/color conversion.
+ */
+
+
+/* This file is included by jdmerge.c */
+
+
+/*
+ * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+INLINE /* converts one row group to one output row of RGB_PIXELSIZE-sample pixels; two output pixels share each chroma pair */
+LOCAL(void)
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* selects the row read from each of the three input planes */
+ JSAMPARRAY output_buf) /* only output_buf[0] is written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr;
+ JSAMPROW inptr0, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ SHIFT_TEMPS
+
+ inptr0 = input_buf[0][in_row_group_ctr]; /* plane 0: read as luma (y) below */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr = output_buf[0];
+ /* Loop for each pair of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++;
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+ /* Fetch 2 Y values and emit 2 pixels */
+ y = *inptr0++;
+ outptr[RGB_RED] = range_limit[y + cred]; /* RGB_* offsets are not defined in this file; presumably set per pixel format by the includer (jdmerge.c) - confirm */
+ outptr[RGB_GREEN] = range_limit[y + cgreen];
+ outptr[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr[RGB_ALPHA] = MAXJSAMPLE; /* formats with an alpha slot get the maximum sample value */
+#endif
+ outptr += RGB_PIXELSIZE;
+ y = *inptr0++;
+ outptr[RGB_RED] = range_limit[y + cred];
+ outptr[RGB_GREEN] = range_limit[y + cgreen];
+ outptr[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ outptr += RGB_PIXELSIZE;
+ }
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+ y = *inptr0;
+ outptr[RGB_RED] = range_limit[y + cred];
+ outptr[RGB_GREEN] = range_limit[y + cgreen];
+ outptr[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ }
+}
+
+
+/*
+ * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ */
+
+INLINE /* converts one row group to two output rows of RGB_PIXELSIZE-sample pixels; each chroma pair serves a 2x2 pixel block */
+LOCAL(void)
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, /* chroma row index; luma rows 2*ctr and 2*ctr+1 are read */
+ JSAMPARRAY output_buf) /* output_buf[0] and output_buf[1] are both written */
+{
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ register int y, cred, cgreen, cblue;
+ int cb, cr;
+ register JSAMPROW outptr0, outptr1;
+ JSAMPROW inptr00, inptr01, inptr1, inptr2;
+ JDIMENSION col;
+ /* copy these pointers into registers if possible */
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
+ SHIFT_TEMPS
+
+ inptr00 = input_buf[0][in_row_group_ctr * 2]; /* upper luma row of the group */
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1]; /* lower luma row of the group */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* plane 1: read as cb below */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* plane 2: read as cr below */
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+ /* Loop for each group of output pixels */
+ for (col = cinfo->output_width >> 1; col > 0; col--) {
+ /* Do the chroma part of the calculation */
+ cb = *inptr1++;
+ cr = *inptr2++;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); /* sum both green contributions, then descale */
+ cblue = Cbbtab[cb];
+ /* Fetch 4 Y values and emit 4 pixels */
+ y = *inptr00++;
+ outptr0[RGB_RED] = range_limit[y + cred]; /* RGB_* offsets are not defined in this file; presumably set per pixel format by the includer (jdmerge.c) - confirm */
+ outptr0[RGB_GREEN] = range_limit[y + cgreen];
+ outptr0[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr0[RGB_ALPHA] = MAXJSAMPLE; /* formats with an alpha slot get the maximum sample value */
+#endif
+ outptr0 += RGB_PIXELSIZE;
+ y = *inptr00++;
+ outptr0[RGB_RED] = range_limit[y + cred];
+ outptr0[RGB_GREEN] = range_limit[y + cgreen];
+ outptr0[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr0[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ outptr0 += RGB_PIXELSIZE;
+ y = *inptr01++; /* same chroma terms are reused for the lower row */
+ outptr1[RGB_RED] = range_limit[y + cred];
+ outptr1[RGB_GREEN] = range_limit[y + cgreen];
+ outptr1[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr1[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ outptr1 += RGB_PIXELSIZE;
+ y = *inptr01++;
+ outptr1[RGB_RED] = range_limit[y + cred];
+ outptr1[RGB_GREEN] = range_limit[y + cgreen];
+ outptr1[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr1[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ outptr1 += RGB_PIXELSIZE;
+ }
+ /* If image width is odd, do the last output column separately */
+ if (cinfo->output_width & 1) {
+ cb = *inptr1;
+ cr = *inptr2;
+ cred = Crrtab[cr];
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cblue = Cbbtab[cb];
+ y = *inptr00;
+ outptr0[RGB_RED] = range_limit[y + cred];
+ outptr0[RGB_GREEN] = range_limit[y + cgreen];
+ outptr0[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr0[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ y = *inptr01;
+ outptr1[RGB_RED] = range_limit[y + cred];
+ outptr1[RGB_GREEN] = range_limit[y + cgreen];
+ outptr1[RGB_BLUE] = range_limit[y + cblue];
+#ifdef RGB_ALPHA
+ outptr1[RGB_ALPHA] = MAXJSAMPLE;
+#endif
+ }
+}
diff --git a/media/libjpeg/jdphuff.c b/media/libjpeg/jdphuff.c
new file mode 100644
index 0000000000..9680ebcbd0
--- /dev/null
+++ b/media/libjpeg/jdphuff.c
@@ -0,0 +1,679 @@
+/*
+ * jdphuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1995-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy decoding routines for progressive JPEG.
+ *
+ * Much of the complexity here has to do with supporting input suspension.
+ * If the data source module demands suspension, we want to be able to back
+ * up to the start of the current MCU. To do this, we copy state variables
+ * into local working storage, and update them back to the permanent
+ * storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdhuff.h" /* Declarations shared with jdhuff.c */
+#include <limits.h>
+
+
+#ifdef D_PROGRESSIVE_SUPPORTED
+
+/*
+ * Expanded entropy decoder object for progressive Huffman decoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+typedef struct { /* copied into a local at MCU start and stored back only after the MCU completes (see decode_mcu_DC_first) */
+ unsigned int EOBRUN; /* remaining EOBs in EOBRUN */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component; indexed by position within the scan */
+} savable_state;
+
+typedef struct { /* progressive-Huffman decoder object; the routines below obtain it by casting cinfo->entropy */
+ struct jpeg_entropy_decoder pub; /* public fields */
+
+ /* These fields are loaded into local variables at start of each MCU.
+ * In case of suspension, we exit WITHOUT updating them.
+ */
+ bitread_perm_state bitstate; /* Bit buffer at start of MCU */
+ savable_state saved; /* Other state at start of MCU */
+
+ /* These fields are NOT loaded into local working state. */
+ unsigned int restarts_to_go; /* MCUs left in this restart interval */
+
+ /* Pointers to derived tables (these workspaces have image lifespan) */
+ d_derived_tbl *derived_tbls[NUM_HUFF_TBLS]; /* indexed by table number; filled in by start_pass_phuff_decoder */
+
+ d_derived_tbl *ac_derived_tbl; /* active table during an AC scan */
+} phuff_entropy_decoder;
+
+typedef phuff_entropy_decoder *phuff_entropy_ptr;
+
+/* Forward declarations */
+METHODDEF(boolean) decode_mcu_DC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_refine(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+
+
+/*
+ * Initialize for a Huffman-compressed scan.
+ */
+
+METHODDEF(void) /* per-scan setup: validates Ss/Se/Ah/Al, updates coef_bits, selects decode_mcu, builds derived tables, resets state */
+start_pass_phuff_decoder(j_decompress_ptr cinfo)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ boolean is_DC_band, bad;
+ int ci, coefi, tbl;
+ d_derived_tbl **pdtbl;
+ int *coef_bit_ptr, *prev_coef_bit_ptr;
+ jpeg_component_info *compptr;
+
+ is_DC_band = (cinfo->Ss == 0); /* a band starting at coefficient 0 is treated as a DC scan */
+
+ /* Validate scan parameters */
+ bad = FALSE;
+ if (is_DC_band) {
+ if (cinfo->Se != 0) /* a DC scan must cover coefficient 0 only */
+ bad = TRUE;
+ } else {
+ /* need not check Ss/Se < 0 since they came from unsigned bytes */
+ if (cinfo->Ss > cinfo->Se || cinfo->Se >= DCTSIZE2)
+ bad = TRUE;
+ /* AC scans may have only one component */
+ if (cinfo->comps_in_scan != 1)
+ bad = TRUE;
+ }
+ if (cinfo->Ah != 0) {
+ /* Successive approximation refinement scan: must have Al = Ah-1. */
+ if (cinfo->Al != cinfo->Ah - 1)
+ bad = TRUE;
+ }
+ if (cinfo->Al > 13) /* need not check for < 0 */
+ bad = TRUE;
+ /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
+ * but the spec doesn't say so, and we try to be liberal about what we
+ * accept. Note: large Al values could result in out-of-range DC
+ * coefficients during early scans, leading to bizarre displays due to
+ * overflows in the IDCT math. But we won't crash.
+ */
+ if (bad)
+ ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+ cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+ /* Update progression status, and verify that scan order is legal.
+ * Note that inter-scan inconsistencies are treated as warnings
+ * not fatal errors ... not clear if this is right way to behave.
+ */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ int cindex = cinfo->cur_comp_info[ci]->component_index;
+ coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0]; /* rows past num_components hold the previous-scan copy written below */
+ if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) { /* always covers at least coefficients 1..9; NOTE(review): presumably the range a reader of the previous-scan copy inspects - confirm */
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
+ for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+ int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; /* a negative entry is treated like 0, i.e. a first scan (Ah == 0) is expected */
+ if (cinfo->Ah != expected)
+ WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+ coef_bit_ptr[coefi] = cinfo->Al; /* record this scan's Al for the coefficient */
+ }
+ }
+
+ /* Select MCU decoding routine */
+ if (cinfo->Ah == 0) { /* Ah == 0: first scan for this band */
+ if (is_DC_band)
+ entropy->pub.decode_mcu = decode_mcu_DC_first;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_first;
+ } else { /* Ah != 0: refinement scan */
+ if (is_DC_band)
+ entropy->pub.decode_mcu = decode_mcu_DC_refine;
+ else
+ entropy->pub.decode_mcu = decode_mcu_AC_refine;
+ }
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ /* Make sure requested tables are present, and compute derived tables.
+ * We may build same derived table more than once, but it's not expensive.
+ */
+ if (is_DC_band) {
+ if (cinfo->Ah == 0) { /* DC refinement needs no table */
+ tbl = compptr->dc_tbl_no;
+ pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
+ jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, pdtbl);
+ }
+ } else {
+ tbl = compptr->ac_tbl_no;
+ pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
+ jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, pdtbl);
+ /* remember the single active table */
+ entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
+ }
+ /* Initialize DC predictions to 0 */
+ entropy->saved.last_dc_val[ci] = 0;
+ }
+
+ /* Initialize bitread state variables */
+ entropy->bitstate.bits_left = 0;
+ entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+ entropy->pub.insufficient_data = FALSE;
+
+ /* Initialize private state variables */
+ entropy->saved.EOBRUN = 0;
+
+ /* Initialize restart counter */
+ entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Figure F.12: extend sign bit.
+ * On some machines, a shift and add will be faster than a table lookup.
+ */
+
+#define AVOID_TABLES /* defined unconditionally here, so the shift-based HUFF_EXTEND is used and the table variant below is not compiled */
+#ifdef AVOID_TABLES
+
+#define NEG_1 ((unsigned)-1) /* all-ones as unsigned, so the left shifts that use it operate on an unsigned value */
+#define HUFF_EXTEND(x, s) \
+ ((x) < (1 << ((s) - 1)) ? (x) + (((NEG_1) << (s)) + 1) : (x)) /* x below 2^(s-1) encodes a negative value: add (-1 << s) + 1; otherwise x is used as is */
+
+#else
+
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x)) /* same computation as the shift form, with threshold and offset read from tables */
+
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
+
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
+
+#endif /* AVOID_TABLES */
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ * Returns FALSE if must suspend.
+ */
+
+LOCAL(boolean) /* TRUE once the restart marker is consumed and decoder state is reset; FALSE if the marker reader could not proceed */
+process_restart(j_decompress_ptr cinfo)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ int ci;
+
+ /* Throw away any unused bits remaining in bit buffer; */
+ /* include any full bytes in next_marker's count of discarded bytes */
+ cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8; /* whole bytes only; a partial byte is not counted */
+ entropy->bitstate.bits_left = 0;
+
+ /* Advance past the RSTn marker */
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
+ return FALSE; /* note: bits_left and discarded_bytes were already modified above */
+
+ /* Re-initialize DC predictions to 0 */
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++)
+ entropy->saved.last_dc_val[ci] = 0;
+ /* Re-init EOB run count, too */
+ entropy->saved.EOBRUN = 0;
+
+ /* Reset restart counter */
+ entropy->restarts_to_go = cinfo->restart_interval;
+
+ /* Reset out-of-data flag, unless read_restart_marker left us smack up
+ * against a marker. In that case we will end up treating the next data
+ * segment as empty, and we can avoid producing bogus output pixels by
+ * leaving the flag set.
+ */
+ if (cinfo->unread_marker == 0)
+ entropy->pub.insufficient_data = FALSE;
+
+ return TRUE;
+}
+
+
+/*
+ * Huffman MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * Huffman-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ *
+ * We return FALSE if data source requested suspension. In that case no
+ * changes have been made to permanent state. (Exception: some output
+ * coefficients may already have been assigned. This is harmless for
+ * spectral selection, since we'll just re-assign them on the next call.
+ * Successive approximation AC refinement has to be more careful, however.)
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean) /* decodes the DC coefficient of every block in one MCU; FALSE means the caller must retry this MCU later */
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ int Al = cinfo->Al;
+ register int s, r;
+ int blkn, ci;
+ JBLOCKROW block;
+ BITREAD_STATE_VARS;
+ savable_state state;
+ d_derived_tbl *tbl;
+ jpeg_component_info *compptr;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (!process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, just leave the MCU set to zeroes.
+ * This way, we return uniform gray for the remainder of the segment.
+ */
+ if (!entropy->pub.insufficient_data) {
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved; /* work on a copy so an early return leaves entropy->saved untouched */
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+ ci = cinfo->MCU_membership[blkn]; /* position of this block's component within the scan */
+ compptr = cinfo->cur_comp_info[ci];
+ tbl = entropy->derived_tbls[compptr->dc_tbl_no];
+
+ /* Decode a single block's worth of coefficients */
+
+ /* Section F.2.2.1: decode the DC coefficient difference */
+ HUFF_DECODE(s, br_state, tbl, return FALSE, label1); /* 4th argument is the statement the macro runs when it cannot continue (presumably suspension - see jdhuff.h) */
+ if (s) { /* s is the bit length of the difference; 0 means a zero difference */
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s); /* turn the raw s-bit field into a signed difference */
+ }
+
+ /* Convert DC difference to actual value, update last_dc_val */
+ if ((state.last_dc_val[ci] >= 0 &&
+ s > INT_MAX - state.last_dc_val[ci]) ||
+ (state.last_dc_val[ci] < 0 && s < INT_MIN - state.last_dc_val[ci]))
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); /* the addition below would overflow int */
+ s += state.last_dc_val[ci];
+ state.last_dc_val[ci] = s;
+ /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
+ (*block)[0] = (JCOEF)LEFT_SHIFT(s, Al);
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean) /* decodes coefficients Ss..Se of the single block in this MCU; FALSE means the caller must retry this MCU later */
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ int Se = cinfo->Se;
+ int Al = cinfo->Al;
+ register int s, k, r;
+ unsigned int EOBRUN;
+ JBLOCKROW block;
+ BITREAD_STATE_VARS;
+ d_derived_tbl *tbl;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (!process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, just leave the MCU set to zeroes.
+ * This way, we return uniform gray for the remainder of the segment.
+ */
+ if (!entropy->pub.insufficient_data) {
+
+ /* Load up working state.
+ * We can avoid loading/saving bitread state if in an EOB run.
+ */
+ EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+ /* There is always only one block per MCU */
+
+ if (EOBRUN > 0) /* if it's a band of zeroes... */
+ EOBRUN--; /* ...process it now (we do nothing) */
+ else {
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ block = MCU_data[0];
+ tbl = entropy->ac_derived_tbl;
+
+ for (k = cinfo->Ss; k <= Se; k++) {
+ HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
+ r = s >> 4; /* high nibble: zero-run length, or the EOB-run exponent when the low nibble is 0 */
+ s &= 15; /* low nibble: bit length of the coefficient; 0 selects ZRL/EOBr handling below */
+ if (s) {
+ k += r; /* skip r zero coefficients */
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ /* Scale and output coefficient in natural (dezigzagged) order */
+ (*block)[jpeg_natural_order[k]] = (JCOEF)LEFT_SHIFT(s, Al); /* NOTE(review): k is not re-checked against Se after k += r; presumably jpeg_natural_order is padded for corrupt input - confirm */
+ } else {
+ if (r == 15) { /* ZRL */
+ k += 15; /* skip 15 zeroes in band */
+ } else { /* EOBr, run length is 2^r + appended bits */
+ EOBRUN = 1 << r;
+ if (r) { /* EOBr, r > 0 */
+ CHECK_BIT_BUFFER(br_state, r, return FALSE);
+ r = GET_BITS(r);
+ EOBRUN += r;
+ }
+ EOBRUN--; /* this band is processed at this moment */
+ break; /* force end-of-band */
+ }
+ }
+ }
+
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ }
+
+ /* Completed MCU, so update state */
+ entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean) /* reads one bit per block and ORs it into bit Al of that block's DC coefficient; FALSE means retry this MCU later */
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
+ int blkn;
+ JBLOCKROW block;
+ BITREAD_STATE_VARS;
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (!process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* Not worth the cycles to check insufficient_data here,
+ * since we will not change the data anyway if we read zeroes.
+ */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+
+ /* Outer loop handles each block in the MCU */
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+ block = MCU_data[blkn];
+
+ /* Encoded data is simply the next bit of the two's-complement DC value */
+ CHECK_BIT_BUFFER(br_state, 1, return FALSE); /* early return happens before the bit state is saved below */
+ if (GET_BITS(1))
+ (*block)[0] |= p1;
+ /* Note: since we use |=, repeating the assignment later is safe */
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+
+ /* Account for restart interval (no-op if not using restarts) */
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
+
+ return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ *
+ * Processes the single block MCU_data[0] over the spectral band
+ * cinfo->Ss..cinfo->Se, refining bit position cinfo->Al.  Coefficients
+ * that become nonzero in this scan are set to +/- (1 << Al); coefficients
+ * that were already nonzero receive one correction bit each.
+ *
+ * Returns TRUE when the MCU has been processed, or skipped because
+ * insufficient_data is set.  Returns FALSE when the restart marker could
+ * not be processed, or when a HUFF_DECODE / CHECK_BIT_BUFFER failure path
+ * jumps to undoit; in the latter case every coefficient made newly nonzero
+ * by this call is zeroed again, and neither the bit-reader state nor EOBRUN
+ * is written back to the entropy object.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ int Se = cinfo->Se;
+ int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
+ int m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
+ register int s, k, r;
+ unsigned int EOBRUN;
+ JBLOCKROW block;
+ JCOEFPTR thiscoef;
+ BITREAD_STATE_VARS;
+ d_derived_tbl *tbl;
+ int num_newnz; /* number of valid entries in newnz_pos[] */
+ int newnz_pos[DCTSIZE2]; /* natural-order positions made nonzero so far */
+
+ /* Process restart marker if needed; may have to suspend */
+ if (cinfo->restart_interval) {
+ if (entropy->restarts_to_go == 0)
+ if (!process_restart(cinfo))
+ return FALSE;
+ }
+
+ /* If we've run out of data, don't modify the MCU.
+ */
+ if (!entropy->pub.insufficient_data) {
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+ /* There is always only one block per MCU */
+ block = MCU_data[0];
+ tbl = entropy->ac_derived_tbl;
+
+ /* If we are forced to suspend, we must undo the assignments to any newly
+ * nonzero coefficients in the block, because otherwise we'd get confused
+ * next time about which coefficients were already nonzero.
+ * But we need not undo addition of bits to already-nonzero coefficients;
+ * instead, we can test the current bit to see if we already did it.
+ */
+ num_newnz = 0;
+
+ /* initialize coefficient loop counter to start of band */
+ k = cinfo->Ss;
+
+ if (EOBRUN == 0) {
+ for (; k <= Se; k++) {
+ HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
+ r = s >> 4; /* high nibble: run length / EOB run exponent */
+ s &= 15; /* low nibble: size of the new coefficient */
+ if (s) {
+ if (s != 1) /* size of new coef should always be 1 */
+ WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1))
+ s = p1; /* newly nonzero coef is positive */
+ else
+ s = m1; /* newly nonzero coef is negative */
+ } else {
+ if (r != 15) {
+ EOBRUN = 1 << r; /* EOBr, run length is 2^r + appended bits */
+ if (r) {
+ CHECK_BIT_BUFFER(br_state, r, goto undoit);
+ r = GET_BITS(r);
+ EOBRUN += r;
+ }
+ break; /* rest of block is handled by EOB logic */
+ }
+ /* note s = 0 for processing ZRL */
+ }
+ /* Advance over already-nonzero coefs and r still-zero coefs,
+ * appending correction bits to the nonzeroes. A correction bit is 1
+ * if the absolute value of the coefficient must be increased.
+ */
+ do {
+ thiscoef = *block + jpeg_natural_order[k];
+ if (*thiscoef != 0) {
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1)) {
+ if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
+ if (*thiscoef >= 0)
+ *thiscoef += (JCOEF)p1;
+ else
+ *thiscoef += (JCOEF)m1;
+ }
+ }
+ } else {
+ if (--r < 0)
+ break; /* reached target zero coefficient */
+ }
+ k++;
+ } while (k <= Se);
+ if (s) {
+ int pos = jpeg_natural_order[k];
+ /* Output newly nonzero coefficient */
+ (*block)[pos] = (JCOEF)s;
+ /* Remember its position in case we have to suspend */
+ newnz_pos[num_newnz++] = pos;
+ }
+ }
+ }
+
+ if (EOBRUN > 0) {
+ /* Scan any remaining coefficient positions after the end-of-band
+ * (the last newly nonzero coefficient, if any). Append a correction
+ * bit to each already-nonzero coefficient. A correction bit is 1
+ * if the absolute value of the coefficient must be increased.
+ */
+ for (; k <= Se; k++) {
+ thiscoef = *block + jpeg_natural_order[k];
+ if (*thiscoef != 0) {
+ CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+ if (GET_BITS(1)) {
+ if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
+ if (*thiscoef >= 0)
+ *thiscoef += (JCOEF)p1;
+ else
+ *thiscoef += (JCOEF)m1;
+ }
+ }
+ }
+ }
+ /* Count one block completed in EOB run */
+ EOBRUN--;
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+ }
+
+ /* Account for restart interval (no-op if not using restarts) */
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
+
+ return TRUE;
+
+undoit:
+ /* Re-zero any output coefficients that we made newly nonzero */
+ while (num_newnz > 0)
+ (*block)[newnz_pos[--num_newnz]] = 0;
+
+ return FALSE;
+}
+
+
+/*
+ * Set up the progressive Huffman entropy decoder module.
+ *
+ * Allocates the decoder object in the image pool, installs its start_pass
+ * method, clears the derived-table slots, and creates cinfo->coef_bits.
+ * Storage for 2 * num_components rows of DCTSIZE2 ints is requested; only
+ * the first num_components rows are initialized (to -1) here.
+ */
+
+GLOBAL(void)
+jinit_phuff_decoder(j_decompress_ptr cinfo)
+{
+  phuff_entropy_ptr entropy;
+  int tblno, ci, coefi;
+
+  entropy = (phuff_entropy_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(phuff_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
+  entropy->pub.start_pass = start_pass_phuff_decoder;
+
+  /* No derived Huffman tables have been built yet. */
+  for (tblno = 0; tblno < NUM_HUFF_TBLS; tblno++)
+    entropy->derived_tbls[tblno] = NULL;
+
+  /* Progression status table: one row of DCTSIZE2 entries per component. */
+  cinfo->coef_bits = (int (*)[DCTSIZE2])(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     cinfo->num_components * 2 * DCTSIZE2 * sizeof(int));
+  for (ci = 0; ci < cinfo->num_components; ci++) {
+    for (coefi = 0; coefi < DCTSIZE2; coefi++)
+      cinfo->coef_bits[ci][coefi] = -1;
+  }
+}
+
+#endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/media/libjpeg/jdpostct.c b/media/libjpeg/jdpostct.c
new file mode 100644
index 0000000000..6a2cf5c1b3
--- /dev/null
+++ b/media/libjpeg/jdpostct.c
@@ -0,0 +1,294 @@
+/*
+ * jdpostct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the decompression postprocessing controller.
+ * This controller manages the upsampling, color conversion, and color
+ * quantization/reduction steps; specifically, it controls the buffering
+ * between upsample/color conversion and color quantization/reduction.
+ *
+ * If no color quantization/reduction is required, then this module has no
+ * work to do, and it just hands off to the upsample/color conversion code.
+ * An integrated upsample/convert/quantize process would replace this module
+ * entirely.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Private buffer controller object.
+ * Allocated by jinit_d_post_controller and reached through cinfo->post.
+ */
+
+typedef struct {
+ struct jpeg_d_post_controller pub; /* public fields */
+
+ /* Color quantization source buffer: this holds output data from
+ * the upsample/color conversion step to be passed to the quantizer.
+ * For two-pass color quantization, we need a full-image buffer;
+ * for one-pass operation, a strip buffer is sufficient.
+ * Both pointers stay NULL when cinfo->quantize_colors is not set.
+ */
+ jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */
+ JSAMPARRAY buffer; /* strip buffer, or current strip of virtual */
+ JDIMENSION strip_height; /* buffer size in rows (max_v_samp_factor) */
+ /* for two-pass mode only (both are reset to 0 by start_pass_dpost): */
+ JDIMENSION starting_row; /* row # of first row in current strip */
+ JDIMENSION next_row; /* index of next row to fill/empty in strip */
+} my_post_controller;
+
+typedef my_post_controller *my_post_ptr;
+
+
+/* Forward declarations of the post_process_data implementations that
+ * start_pass_dpost selects between. The prepass/2pass variants exist only
+ * when QUANT_2PASS_SUPPORTED is defined.
+ */
+METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+#ifdef QUANT_2PASS_SUPPORTED
+METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+#endif
+
+
+/*
+ * Prepare the postprocessing controller for one pass.
+ *
+ * Selects the post_process_data method that matches pass_mode and rewinds
+ * the strip bookkeeping.  A mode that is not handled, or a two-pass mode
+ * requested without a full-image buffer, is reported as
+ * JERR_BAD_BUFFER_MODE.
+ */
+
+METHODDEF(void)
+start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_post_ptr post = (my_post_ptr)cinfo->post;
+
+  switch (pass_mode) {
+  case JBUF_PASS_THRU:
+    if (!cinfo->quantize_colors) {
+      /* No quantization: nothing for this module to do, so route calls
+       * straight to the upsampler.
+       */
+      post->pub.post_process_data = cinfo->upsample->upsample;
+      break;
+    }
+    /* Single-pass processing with color quantization. */
+    post->pub.post_process_data = post_process_1pass;
+    /* When buffered-image output precedes a 2-pass quantization,
+     * jinit_d_post_controller made no strip buffer; borrow the first
+     * strip of the virtual array as workspace instead.
+     */
+    if (post->buffer == NULL)
+      post->buffer = (*cinfo->mem->access_virt_sarray)
+        ((j_common_ptr)cinfo, post->whole_image, (JDIMENSION)0,
+         post->strip_height, TRUE);
+    break;
+#ifdef QUANT_2PASS_SUPPORTED
+  case JBUF_SAVE_AND_PASS:      /* first pass of 2-pass quantization */
+  case JBUF_CRANK_DEST:         /* second pass of 2-pass quantization */
+    if (post->whole_image == NULL)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    if (pass_mode == JBUF_SAVE_AND_PASS)
+      post->pub.post_process_data = post_process_prepass;
+    else
+      post->pub.post_process_data = post_process_2pass;
+    break;
+#endif /* QUANT_2PASS_SUPPORTED */
+  default:
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    break;
+  }
+
+  /* Every pass starts at the top of the first strip. */
+  post->next_row = 0;
+  post->starting_row = 0;
+}
+
+
+/*
+ * One-pass (strip buffer) processing.
+ * Serves both one-pass quantization and color precision reduction.
+ *
+ * Upsamples at most one strip into post->buffer, quantizes it into
+ * output_buf starting at row *out_row_ctr, and advances *out_row_ctr by
+ * the number of rows produced.
+ */
+
+METHODDEF(void)
+post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+  my_post_ptr post = (my_post_ptr)cinfo->post;
+  JDIMENSION rows_done = 0;
+  JDIMENSION row_limit = out_rows_avail - *out_row_ctr;
+
+  /* Never produce more than one strip, nor more than the caller has room
+   * for.  Detecting the bottom of the image is left to the upsampler.
+   */
+  if (row_limit > post->strip_height)
+    row_limit = post->strip_height;
+
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer,
+                                &rows_done, row_limit);
+
+  /* Quantize whatever was produced and hand it to the caller. */
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
+                                       output_buf + *out_row_ctr,
+                                       (int)rows_done);
+  *out_row_ctr += rows_done;
+}
+
+
+#ifdef QUANT_2PASS_SUPPORTED
+
+/*
+ * First pass of 2-pass quantization.
+ *
+ * Upsampled rows are stored into the full-image virtual array and shown to
+ * the quantizer, which receives a NULL output pointer for this pass.
+ * output_buf and out_rows_avail are not used.
+ */
+
+METHODDEF(void)
+post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION *in_row_group_ctr,
+                     JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+  my_post_ptr post = (my_post_ptr)cinfo->post;
+  JDIMENSION first_new_row, new_rows;
+
+  /* At the top of a strip, map that strip of the virtual array (writable). */
+  if (post->next_row == 0)
+    post->buffer = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, post->whole_image, post->starting_row,
+       post->strip_height, TRUE);
+
+  /* Let the upsampler append rows, up to the end of the strip. */
+  first_new_row = post->next_row;
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer,
+                                &post->next_row, post->strip_height);
+
+  /* Show the new rows to the quantizer.  Nothing is emitted, but
+   * out_row_ctr still advances so the outer loop can tell when we're done.
+   */
+  if (post->next_row > first_new_row) {
+    new_rows = post->next_row - first_new_row;
+    (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + first_new_row,
+                                         (JSAMPARRAY)NULL, (int)new_rows);
+    *out_row_ctr += new_rows;
+  }
+
+  /* A full strip means the next call starts on the following strip. */
+  if (post->next_row >= post->strip_height) {
+    post->starting_row += post->strip_height;
+    post->next_row = 0;
+  }
+}
+
+
+/*
+ * Second pass of 2-pass quantization.
+ *
+ * Rows are read back from the full-image virtual array, quantized, and
+ * written to output_buf.  The input_buf, in_row_group_ctr and
+ * in_row_groups_avail arguments are not used in this pass.
+ */
+
+METHODDEF(void)
+post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
+{
+  my_post_ptr post = (my_post_ptr)cinfo->post;
+  JDIMENSION emit_rows, cap;
+
+  /* At the top of a strip, map that strip of the virtual array (read-only). */
+  if (post->next_row == 0)
+    post->buffer = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, post->whole_image, post->starting_row,
+       post->strip_height, FALSE);
+
+  /* Start from what is left in the strip ... */
+  emit_rows = post->strip_height - post->next_row;
+  /* ... clamp to the room left in the output area ... */
+  cap = out_rows_avail - *out_row_ctr;
+  if (cap < emit_rows)
+    emit_rows = cap;
+  /* ... and to the bottom of the image, which must be checked here since
+   * the upsampler is not involved in this pass.
+   */
+  cap = cinfo->output_height - post->starting_row;
+  if (cap < emit_rows)
+    emit_rows = cap;
+
+  /* Quantize and emit data. */
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
+                                       output_buf + *out_row_ctr,
+                                       (int)emit_rows);
+  *out_row_ctr += emit_rows;
+
+  /* Move on to the next strip once this one is used up. */
+  post->next_row += emit_rows;
+  if (post->next_row >= post->strip_height) {
+    post->starting_row += post->strip_height;
+    post->next_row = 0;
+  }
+}
+
+#endif /* QUANT_2PASS_SUPPORTED */
+
+
+/*
+ * Create the postprocessing controller.
+ *
+ * The controller object is always allocated.  A quantization buffer is
+ * added only when cinfo->quantize_colors is set: a full-image virtual
+ * array if need_full_buffer, otherwise a single strip.
+ */
+
+GLOBAL(void)
+jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+{
+  my_post_ptr post = (my_post_ptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, sizeof(my_post_controller));
+
+  cinfo->post = (struct jpeg_d_post_controller *)post;
+  post->pub.start_pass = start_pass_dpost;
+  post->whole_image = NULL;     /* flag for no virtual arrays */
+  post->buffer = NULL;          /* flag for no strip buffer */
+
+  /* Without color quantization no buffer is needed at all. */
+  if (!cinfo->quantize_colors)
+    return;
+
+  /* Strips are max_v_samp_factor rows tall, which is typically an
+   * efficient number of rows for upsampling to return.
+   * (In the presence of output rescaling, we might want to be smarter?)
+   */
+  post->strip_height = (JDIMENSION)cinfo->max_v_samp_factor;
+
+  if (!need_full_buffer) {
+    /* One-pass color quantization: just make a strip buffer. */
+    post->buffer = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       cinfo->output_width * cinfo->out_color_components,
+       post->strip_height);
+    return;
+  }
+
+  /* Two-pass color quantization: need full-image storage, with the row
+   * count rounded up to a multiple of the strip height.
+   */
+#ifdef QUANT_2PASS_SUPPORTED
+  post->whole_image = (*cinfo->mem->request_virt_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+     cinfo->output_width * cinfo->out_color_components,
+     (JDIMENSION)jround_up((long)cinfo->output_height,
+                           (long)post->strip_height),
+     post->strip_height);
+#else
+  ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+#endif /* QUANT_2PASS_SUPPORTED */
+}
diff --git a/media/libjpeg/jdsample.c b/media/libjpeg/jdsample.c
new file mode 100644
index 0000000000..eaad72a030
--- /dev/null
+++ b/media/libjpeg/jdsample.c
@@ -0,0 +1,524 @@
+/*
+ * jdsample.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019-2020, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains upsampling routines.
+ *
+ * Upsampling input data is counted in "row groups". A row group
+ * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * sample rows of each component. Upsampling will normally produce
+ * max_v_samp_factor pixel rows from each row group (but this could vary
+ * if the upsampler is applying a scale factor of its own).
+ *
+ * An excellent reference for image resampling is
+ * Digital Image Warping, George Wolberg, 1990.
+ * Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7.
+ */
+
+#include "jinclude.h"
+#include "jdsample.h"
+#include "jsimd.h"
+#include "jpegcomp.h"
+
+
+
+/*
+ * Reset the upsampler state at the start of a pass.
+ */
+
+METHODDEF(void)
+start_pass_upsample(j_decompress_ptr cinfo)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+  /* The whole image is still to be emitted; sep_upsample counts this down
+   * to detect the bottom of the image.
+   */
+  upsample->rows_to_go = cinfo->output_height;
+  /* A value of max_v_samp_factor marks the conversion buffer as empty. */
+  upsample->next_row_out = cinfo->max_v_samp_factor;
+}
+
+
+/*
+ * Upsampling (and color conversion) control routine.
+ *
+ * Every component is upsampled on its own.  One row group at a time is
+ * expanded into the conversion buffer, which is then color-converted into
+ * output_buf in as many calls as the caller's free space requires.
+ * *in_row_group_ctr advances only once the buffer has been fully emitted.
+ * in_row_groups_avail is not consulted.
+ */
+
+METHODDEF(void)
+sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+             JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+             JDIMENSION out_rows_avail)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+  JDIMENSION rows_ready, client_room;
+  int ci;
+
+  /* Refill the conversion buffer when it has been drained. */
+  if (upsample->next_row_out >= cinfo->max_v_samp_factor) {
+    for (ci = 0; ci < cinfo->num_components; ci++) {
+      jpeg_component_info *compptr = cinfo->comp_info + ci;
+
+      /* The method receives the ADDRESS of color_buf[ci] so that
+       * fullsize_upsample is able to redirect it.
+       */
+      (*upsample->methods[ci]) (cinfo, compptr,
+        input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]),
+        upsample->color_buf + ci);
+    }
+    upsample->next_row_out = 0;
+  }
+
+  /* Rows still sitting in the buffer ... */
+  rows_ready = (JDIMENSION)(cinfo->max_v_samp_factor - upsample->next_row_out);
+  /* ... limited by the rows left in the image (the height need not be a
+   * multiple of max_v_samp_factor) ...
+   */
+  if (rows_ready > upsample->rows_to_go)
+    rows_ready = upsample->rows_to_go;
+  /* ... and by the room the client still has. */
+  client_room = out_rows_avail - *out_row_ctr;
+  if (rows_ready > client_room)
+    rows_ready = client_room;
+
+  (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
+                                     (JDIMENSION)upsample->next_row_out,
+                                     output_buf + *out_row_ctr,
+                                     (int)rows_ready);
+
+  /* Bookkeeping */
+  *out_row_ctr += rows_ready;
+  upsample->rows_to_go -= rows_ready;
+  upsample->next_row_out += rows_ready;
+  /* The input row group counts as consumed once the buffer is empty. */
+  if (upsample->next_row_out >= cinfo->max_v_samp_factor)
+    (*in_row_group_ctr)++;
+}
+
+
+/*
+ * These are the routines invoked by sep_upsample to upsample pixel values
+ * of a single component. One row group is processed per call.
+ */
+
+
+/*
+ * For full-size components, we just make color_buf[ci] point at the
+ * input buffer, and thus avoid copying any data. Note that this is
+ * safe only because sep_upsample doesn't declare the input row group
+ * "consumed" until we are done color converting and emitting it.
+ *
+ * cinfo and compptr are not used. output_data_ptr is the address of the
+ * color_buf[ci] entry passed in by sep_upsample; it is overwritten.
+ */
+
+METHODDEF(void)
+fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ *output_data_ptr = input_data; /* alias the input rows, no copy */
+}
+
+
+/*
+ * This is a no-op version used for "uninteresting" components.
+ * These components will not be referenced by color conversion.
+ *
+ * jinit_upsampler installs it for components whose component_needed flag
+ * is clear. cinfo, compptr and input_data are not used.
+ */
+
+METHODDEF(void)
+noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ *output_data_ptr = NULL; /* safety check */
+}
+
+
+/*
+ * Generic upsampler for any integral sampling ratios.
+ *
+ * Typical JPEG files never get here, so speed is not a concern.  Accuracy
+ * is modest too: each input pixel is simply replicated onto the output
+ * pixels it covers (a "box filter" in the sampling literature).  Box
+ * filters tend to show visible artifacts, so anyone really using 3:1 or
+ * 4:1 sampling ratios would be well advised to improve this code.
+ *
+ * The expansion factors come from the h_expand[] / v_expand[] entries for
+ * this component.
+ */
+
+METHODDEF(void)
+int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+  JSAMPARRAY output_data = *output_data_ptr;
+  int h_expand = upsample->h_expand[compptr->component_index];
+  int v_expand = upsample->v_expand[compptr->component_index];
+  int inrow, outrow, rep;
+
+  for (inrow = 0, outrow = 0; outrow < cinfo->max_v_samp_factor;
+       inrow++, outrow += v_expand) {
+    JSAMPROW src = input_data[inrow];
+    JSAMPROW dst = output_data[outrow];
+    JSAMPROW dst_end = dst + cinfo->output_width;
+
+    /* Expand horizontally: each input sample is written h_expand times. */
+    while (dst < dst_end) {
+      JSAMPLE sample = *src++;
+
+      for (rep = 0; rep < h_expand; rep++)
+        *dst++ = sample;
+    }
+
+    /* Expand vertically: clone the row just built into the rows below. */
+    if (v_expand > 1)
+      jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+                        v_expand - 1, cinfo->output_width);
+  }
+}
+
+
+/*
+ * Fast path for the common 2:1 horizontal, 1:1 vertical case.
+ * Still a box filter: every input sample is written out twice.
+ */
+
+METHODDEF(void)
+h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  int row;
+
+  for (row = 0; row < cinfo->max_v_samp_factor; row++) {
+    JSAMPROW src = input_data[row];
+    JSAMPROW dst = output_data[row];
+    JSAMPROW dst_end = dst + cinfo->output_width;
+
+    for (; dst < dst_end; dst += 2) {
+      JSAMPLE sample = *src++;
+
+      dst[0] = sample;
+      dst[1] = sample;
+    }
+  }
+}
+
+
+/*
+ * Fast path for the common 2:1 horizontal, 2:1 vertical case.
+ * Still a box filter: every input sample fills a 2x2 output square.
+ */
+
+METHODDEF(void)
+h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  int inrow, outrow;
+
+  for (inrow = 0, outrow = 0; outrow < cinfo->max_v_samp_factor;
+       inrow++, outrow += 2) {
+    JSAMPROW src = input_data[inrow];
+    JSAMPROW dst = output_data[outrow];
+    JSAMPROW dst_end = dst + cinfo->output_width;
+
+    /* Double every sample horizontally ... */
+    for (; dst < dst_end; dst += 2) {
+      JSAMPLE sample = *src++;
+
+      dst[0] = sample;
+      dst[1] = sample;
+    }
+    /* ... then copy the finished row into the row beneath it. */
+    jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+                      cinfo->output_width);
+  }
+}
+
+
+/*
+ * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+ *
+ * The upsampling algorithm is linear interpolation between pixel centers,
+ * also known as a "triangle filter". This is a good compromise between
+ * speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+ * of the way between input pixel centers.
+ *
+ * A note about the "bias" calculations: when rounding fractional values to
+ * integer, we do not want to always round 0.5 up to the next integer.
+ * If we did that, we'd introduce a noticeable bias towards larger values.
+ * Instead, this code is arranged so that 0.5 will be rounded up or down at
+ * alternate pixel locations (a simple ordered dither pattern).
+ *
+ * Each input row yields 2 * compptr->downsampled_width output samples.
+ * The first and last columns are handled separately because they have only
+ * one horizontal neighbor. jinit_upsampler selects this routine only when
+ * compptr->downsampled_width > 2.
+ */
+
+METHODDEF(void)
+h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ register JSAMPROW inptr, outptr;
+ register int invalue;
+ register JDIMENSION colctr;
+ int inrow;
+
+ for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* Special case for first column */
+ invalue = *inptr++;
+ *outptr++ = (JSAMPLE)invalue;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
+
+ for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
+ /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
+ invalue = (*inptr++) * 3;
+ *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+ *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
+ }
+
+ /* Special case for last column */
+ invalue = *inptr;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
+ *outptr++ = (JSAMPLE)invalue;
+ }
+}
+
+
+/*
+ * Fancy upsampling for 1:1 horizontal, 2:1 vertical (4:4:0 subsampling).
+ *
+ * Less common than the other cases, but it shows up when a JPEG file that
+ * uses 4:2:2 chroma subsampling is losslessly rotated or transposed.
+ *
+ * Each input row produces two output rows, each weighted 3/4 toward the
+ * nearest input row and 1/4 toward the row above (first output row) or
+ * below (second output row).  Rows inrow - 1 and inrow + 1 of input_data
+ * are read, so the caller has to supply context rows.
+ */
+
+METHODDEF(void)
+h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW near_row, far_row, outptr;
+#if BITS_IN_JSAMPLE == 8
+  int colsum, bias;
+#else
+  JLONG colsum, bias;
+#endif
+  JDIMENSION col;
+  int inrow, outrow, v;
+
+  for (inrow = 0, outrow = 0; outrow < cinfo->max_v_samp_factor; inrow++) {
+    for (v = 0; v < 2; v++) {
+      /* near_row is the nearest input row, far_row the next nearest:
+       * the row above for v == 0, the row below for v == 1.  The rounding
+       * bias alternates between the two output rows.
+       */
+      near_row = input_data[inrow];
+      far_row = input_data[(v == 0) ? inrow - 1 : inrow + 1];
+      bias = (v == 0) ? 1 : 2;
+      outptr = output_data[outrow++];
+
+      for (col = 0; col < compptr->downsampled_width; col++) {
+        colsum = near_row[col] * 3 + far_row[col];
+        outptr[col] = (JSAMPLE)((colsum + bias) >> 2);
+      }
+    }
+  }
+}
+
+
+/*
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ * Again a triangle filter; see comments for h2v1 case, above.
+ *
+ * It is OK for us to reference the adjacent input rows because we demanded
+ * context from the main buffer controller (see initialization code).
+ *
+ * Each input row yields two output rows of 2 * compptr->downsampled_width
+ * samples. A vertically blended column sum (3 * nearest row + next nearest
+ * row) is computed once per input column and reused for both of the output
+ * samples it contributes to. jinit_upsampler selects this routine only when
+ * compptr->downsampled_width > 2.
+ */
+
+METHODDEF(void)
+h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ register JSAMPROW inptr0, inptr1, outptr;
+#if BITS_IN_JSAMPLE == 8
+ register int thiscolsum, lastcolsum, nextcolsum;
+#else
+ register JLONG thiscolsum, lastcolsum, nextcolsum;
+#endif
+ register JDIMENSION colctr;
+ int inrow, outrow, v;
+
+ inrow = outrow = 0;
+ while (outrow < cinfo->max_v_samp_factor) {
+ for (v = 0; v < 2; v++) {
+ /* inptr0 points to nearest input row, inptr1 points to next nearest */
+ inptr0 = input_data[inrow];
+ if (v == 0) /* next nearest is row above */
+ inptr1 = input_data[inrow - 1];
+ else /* next nearest is row below */
+ inptr1 = input_data[inrow + 1];
+ outptr = output_data[outrow++];
+
+ /* Special case for first column */
+ thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+
+ for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
+ /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
+ /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+ }
+
+ /* Special case for last column */
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
+ }
+ inrow++;
+ }
+}
+
+
+/*
+ * Module initialization routine for upsampling.
+ *
+ * Unless cinfo->master->jinit_upsampler_no_alloc is set, this allocates the
+ * my_upsampler object and installs start_pass_upsample and sep_upsample;
+ * otherwise the existing cinfo->upsample object is reused. For every
+ * component it then selects an upsampling method from the ratio between the
+ * component's input group size and max_h_samp_factor x max_v_samp_factor,
+ * and (only when allocating) creates a color conversion buffer for each
+ * component that is neither skipped nor full-size.
+ *
+ * Errors out with JERR_CCIR601_NOTIMPL if CCIR601 sampling is requested and
+ * with JERR_FRACT_SAMPLE_NOTIMPL if a sampling ratio is not integral.
+ */
+
+GLOBAL(void)
+jinit_upsampler(j_decompress_ptr cinfo)
+{
+ my_upsample_ptr upsample;
+ int ci;
+ jpeg_component_info *compptr;
+ boolean need_buffer, do_fancy;
+ int h_in_group, v_in_group, h_out_group, v_out_group;
+
+ if (!cinfo->master->jinit_upsampler_no_alloc) {
+ upsample = (my_upsample_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_upsampler));
+ cinfo->upsample = (struct jpeg_upsampler *)upsample;
+ upsample->pub.start_pass = start_pass_upsample;
+ upsample->pub.upsample = sep_upsample;
+ upsample->pub.need_context_rows = FALSE; /* until we find out differently */
+ } else
+ upsample = (my_upsample_ptr)cinfo->upsample;
+
+ if (cinfo->CCIR601_sampling) /* this isn't supported */
+ ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
+
+ /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
+ * so don't ask for it.
+ */
+ do_fancy = cinfo->do_fancy_upsampling && cinfo->_min_DCT_scaled_size > 1;
+
+ /* Verify we can handle the sampling factors, select per-component methods,
+ * and create storage as needed.
+ */
+ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ ci++, compptr++) {
+ /* Compute size of an "input group" after IDCT scaling. This many samples
+ * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
+ */
+ h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) /
+ cinfo->_min_DCT_scaled_size;
+ v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+ cinfo->_min_DCT_scaled_size;
+ h_out_group = cinfo->max_h_samp_factor;
+ v_out_group = cinfo->max_v_samp_factor;
+ upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
+ need_buffer = TRUE; /* cleared below for noop and fullsize methods */
+ if (!compptr->component_needed) {
+ /* Don't bother to upsample an uninteresting component. */
+ upsample->methods[ci] = noop_upsample;
+ need_buffer = FALSE;
+ } else if (h_in_group == h_out_group && v_in_group == v_out_group) {
+ /* Fullsize components can be processed without any work. */
+ upsample->methods[ci] = fullsize_upsample;
+ need_buffer = FALSE;
+ } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
+ /* Special cases for 2h1v upsampling.
+ * The fancy version is used only for components wider than 2 samples.
+ */
+ if (do_fancy && compptr->downsampled_width > 2) {
+ if (jsimd_can_h2v1_fancy_upsample())
+ upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+ else
+ upsample->methods[ci] = h2v1_fancy_upsample;
+ } else {
+ if (jsimd_can_h2v1_upsample())
+ upsample->methods[ci] = jsimd_h2v1_upsample;
+ else
+ upsample->methods[ci] = h2v1_upsample;
+ }
+ } else if (h_in_group == h_out_group &&
+ v_in_group * 2 == v_out_group && do_fancy) {
+ /* Non-fancy upsampling is handled by the generic method */
+#if defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+ if (jsimd_can_h1v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+ else
+#endif
+ upsample->methods[ci] = h1v2_fancy_upsample;
+ upsample->pub.need_context_rows = TRUE;
+ } else if (h_in_group * 2 == h_out_group &&
+ v_in_group * 2 == v_out_group) {
+ /* Special cases for 2h2v upsampling.
+ * Only the fancy version asks for context rows.
+ */
+ if (do_fancy && compptr->downsampled_width > 2) {
+ if (jsimd_can_h2v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+ else
+ upsample->methods[ci] = h2v2_fancy_upsample;
+ upsample->pub.need_context_rows = TRUE;
+ } else {
+ if (jsimd_can_h2v2_upsample())
+ upsample->methods[ci] = jsimd_h2v2_upsample;
+ else
+ upsample->methods[ci] = h2v2_upsample;
+ }
+ } else if ((h_out_group % h_in_group) == 0 &&
+ (v_out_group % v_in_group) == 0) {
+ /* Generic integral-factors upsampling method */
+#if defined(__mips__)
+ if (jsimd_can_int_upsample())
+ upsample->methods[ci] = jsimd_int_upsample;
+ else
+#endif
+ upsample->methods[ci] = int_upsample;
+ upsample->h_expand[ci] = (UINT8)(h_out_group / h_in_group);
+ upsample->v_expand[ci] = (UINT8)(v_out_group / v_in_group);
+ } else
+ ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
+ if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
+ upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)jround_up((long)cinfo->output_width,
+ (long)cinfo->max_h_samp_factor),
+ (JDIMENSION)cinfo->max_v_samp_factor);
+ }
+ }
+}
diff --git a/media/libjpeg/jdsample.h b/media/libjpeg/jdsample.h
new file mode 100644
index 0000000000..a6bf08a032
--- /dev/null
+++ b/media/libjpeg/jdsample.h
@@ -0,0 +1,50 @@
+/*
+ * jdsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+
+/* Pointer to routine to upsample a single component */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+/* Private subobject: upsampler state, extending the public jpeg_upsampler */
+
+typedef struct {
+ struct jpeg_upsampler pub; /* public fields */
+
+ /* Color conversion buffer. When using separate upsampling and color
+ * conversion steps, this buffer holds one upsampled row group until it
+ * has been color converted and output.
+ * Note: we do not allocate any storage for component(s) which are full-size,
+ * i.e. do not need rescaling. The corresponding entry of color_buf[] is
+ * simply set to point to the input data array, thereby avoiding copying.
+ */
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+
+ /* Per-component upsampling method pointers (SIMD or plain C variant) */
+ upsample1_ptr methods[MAX_COMPONENTS];
+
+ int next_row_out; /* counts rows emitted from color_buf */
+ JDIMENSION rows_to_go; /* counts rows remaining in image */
+
+ /* Height of an input row group for each component. */
+ int rowgroup_height[MAX_COMPONENTS];
+
+ /* These arrays save pixel expansion factors so that int_upsample need not
+ * recompute them each time. They are unused for other upsampling methods.
+ */
+ UINT8 h_expand[MAX_COMPONENTS];
+ UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
diff --git a/media/libjpeg/jdtrans.c b/media/libjpeg/jdtrans.c
new file mode 100644
index 0000000000..d7ec4b83b3
--- /dev/null
+++ b/media/libjpeg/jdtrans.c
@@ -0,0 +1,156 @@
+/*
+ * jdtrans.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1995-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains library routines for transcoding decompression,
+ * that is, reading raw DCT coefficient arrays from an input JPEG file.
+ * The routines in jdapimin.c will also be needed by a transcoder.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jpegcomp.h"
+
+
+/* Forward declarations */
+LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
+
+
+/*
+ * Read the coefficient arrays from a JPEG file.
+ * jpeg_read_header must be completed before calling this.
+ *
+ * The entire image is read into a set of virtual coefficient-block arrays,
+ * one per component. The return value is a pointer to the array of
+ * virtual-array descriptors. These can be manipulated directly via the
+ * JPEG memory manager, or handed off to jpeg_write_coefficients().
+ * To release the memory occupied by the virtual arrays, call
+ * jpeg_finish_decompress() when done with the data.
+ *
+ * An alternative usage is to simply obtain access to the coefficient arrays
+ * during a buffered-image-mode decompression operation. This is allowed
+ * after any jpeg_finish_output() call. The arrays can be accessed until
+ * jpeg_finish_decompress() is called. (Note that any call to the library
+ * may reposition the arrays, so don't rely on access_virt_barray() results
+ * to stay valid across library calls.)
+ *
+ * Returns NULL if suspended. This case need be checked only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(jvirt_barray_ptr *)
+jpeg_read_coefficients(j_decompress_ptr cinfo)
+{
+ if (cinfo->global_state == DSTATE_READY) {
+ /* First call: select and initialize the modules needed for transcoding */
+ transdecode_master_selection(cinfo);
+ cinfo->global_state = DSTATE_RDCOEFS;
+ }
+ if (cinfo->global_state == DSTATE_RDCOEFS) {
+ /* Absorb whole file into the coef buffer */
+ for (;;) {
+ int retcode;
+ /* Call progress monitor hook if present */
+ if (cinfo->progress != NULL)
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
+ /* Absorb some more input */
+ retcode = (*cinfo->inputctl->consume_input) (cinfo);
+ if (retcode == JPEG_SUSPENDED)
+ return NULL; /* state is still DSTATE_RDCOEFS, so the next call resumes this loop */
+ if (retcode == JPEG_REACHED_EOI)
+ break; /* all input has been consumed */
+ /* Advance progress counter if appropriate */
+ if (cinfo->progress != NULL &&
+ (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
+ if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
+ /* startup underestimated number of scans; ratchet up one scan */
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
+ }
+ }
+ }
+ /* Set state so that jpeg_finish_decompress does the right thing */
+ cinfo->global_state = DSTATE_STOPPING;
+ }
+ /* At this point we should be in state DSTATE_STOPPING if being used
+ * standalone, or in state DSTATE_BUFIMAGE if being invoked to get access
+ * to the coefficients during a full buffered-image-mode decompression.
+ */
+ if ((cinfo->global_state == DSTATE_STOPPING ||
+ cinfo->global_state == DSTATE_BUFIMAGE) && cinfo->buffered_image) {
+ return cinfo->coef->coef_arrays;
+ }
+ /* Oops, improper usage: unexpected state, or buffered_image is not set */
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+ return NULL; /* keep compiler happy */
+}
+
+
+/*
+ * Master selection of decompression modules for transcoding.
+ * This substitutes for jdmaster.c's initialization of the full decompressor.
+ */
+
+LOCAL(void)
+transdecode_master_selection(j_decompress_ptr cinfo)
+{
+ /* This is effectively a buffered-image operation. */
+ cinfo->buffered_image = TRUE;
+
+#if JPEG_LIB_VERSION >= 80
+ /* Compute output image dimensions and related values. */
+ jpeg_core_output_dimensions(cinfo);
+#endif
+
+ /* Entropy decoding: either Huffman or arithmetic coding. */
+ if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+ jinit_arith_decoder(cinfo);
+#else
+ ERREXIT(cinfo, JERR_ARITH_NOTIMPL); /* NOTE(review): jerror.h declares this code only when JPEG_LIB_VERSION < 70 */
+#endif
+ } else {
+ if (cinfo->progressive_mode) {
+#ifdef D_PROGRESSIVE_SUPPORTED
+ jinit_phuff_decoder(cinfo);
+#else
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+ } else
+ jinit_huff_decoder(cinfo);
+ }
+
+ /* Always get a full-image coefficient buffer. */
+ jinit_d_coef_controller(cinfo, TRUE);
+
+ /* We can now tell the memory manager to allocate virtual arrays. */
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+ /* Initialize input side of decompressor to consume first scan. */
+ (*cinfo->inputctl->start_input_pass) (cinfo);
+
+ /* Initialize progress monitoring (skipped when no monitor is installed). */
+ if (cinfo->progress != NULL) {
+ int nscans;
+ /* Estimate number of scans to set pass_limit. */
+ if (cinfo->progressive_mode) {
+ /* Arbitrarily estimate 2 interleaved DC scans + 3 AC scans/component. */
+ nscans = 2 + 3 * cinfo->num_components;
+ } else if (cinfo->inputctl->has_multiple_scans) {
+ /* For a nonprogressive multiscan file, estimate 1 scan per component. */
+ nscans = cinfo->num_components;
+ } else {
+ nscans = 1;
+ }
+ cinfo->progress->pass_counter = 0L;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans; /* iMCU rows per scan times estimated scans */
+ cinfo->progress->completed_passes = 0;
+ cinfo->progress->total_passes = 1; /* the whole coefficient read is reported as a single pass */
+ }
+}
diff --git a/media/libjpeg/jerror.c b/media/libjpeg/jerror.c
new file mode 100644
index 0000000000..d0ab5b88b0
--- /dev/null
+++ b/media/libjpeg/jerror.c
@@ -0,0 +1,251 @@
+/*
+ * jerror.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains simple error-reporting and trace-message routines.
+ * These are suitable for Unix-like systems and others where writing to
+ * stderr is the right thing to do. Many applications will want to replace
+ * some or all of these routines.
+ *
+ * If you define USE_WINDOWS_MESSAGEBOX in jconfig.h or in the makefile,
+ * you get a Windows-specific hack to display error messages in a dialog box.
+ * It ain't much, but it beats dropping error messages into the bit bucket,
+ * which is what happens to output to stderr under most Windows C compilers.
+ *
+ * These routines are used by both the compression and decompression code.
+ */
+
+/* this is not a core library module, so it doesn't define JPEG_INTERNALS */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jversion.h"
+#include "jerror.h"
+
+#ifdef USE_WINDOWS_MESSAGEBOX
+#include <windows.h>
+#endif
+
+#ifndef EXIT_FAILURE /* define exit() codes if not provided */
+#define EXIT_FAILURE 1
+#endif
+
+
+/*
+ * Create the message string table.
+ * We do this from the master message list in jerror.h by re-reading
+ * jerror.h with a suitable definition for macro JMESSAGE.
+ * The message table is made an external symbol just in case any applications
+ * want to refer to it directly.
+ */
+
+#define JMESSAGE(code, string) string, /* keep only the text of each entry */
+
+const char * const jpeg_std_message_table[] = {
+#include "jerror.h"
+ NULL /* entry at index JMSG_LASTMSGCODE, one past the last real message */
+};
+
+
+/*
+ * Error exit handler: must not return to caller.
+ *
+ * Applications may override this if they want to get control back after
+ * an error. Typically one would longjmp somewhere instead of exiting.
+ * The setjmp buffer can be made a private field within an expanded error
+ * handler object. Note that the info needed to generate an error message
+ * is stored in the error object, so you can generate the message now or
+ * later, at your convenience.
+ * You should make sure that the JPEG object is cleaned up (with jpeg_abort
+ * or jpeg_destroy) at some point.
+ */
+
+METHODDEF(void)
+error_exit(j_common_ptr cinfo)
+{
+ /* Always display the message (via the possibly overridden output_message) */
+ (*cinfo->err->output_message) (cinfo);
+
+ /* Let the memory manager delete any temp files before we die */
+ jpeg_destroy(cinfo);
+
+ exit(EXIT_FAILURE); /* default policy: terminate the process, so this handler never returns */
+}
+
+
+/*
+ * Actual output of an error or trace message.
+ * Applications may override this method to send JPEG messages somewhere
+ * other than stderr.
+ *
+ * On Windows, printing to stderr is generally completely useless,
+ * so we provide optional code to produce an error-dialog popup.
+ * Most Windows applications will still prefer to override this routine,
+ * but if they don't, it'll do something at least marginally useful.
+ *
+ * NOTE: to use the library in an environment that doesn't support the
+ * C stdio library, you may have to delete the call to fprintf() entirely;
+ * merely overriding this routine is not enough.
+ */
+
+METHODDEF(void)
+output_message(j_common_ptr cinfo)
+{
+ char buffer[JMSG_LENGTH_MAX]; /* size that format_message expects of its buffer */
+
+ /* Create the message text through the (overridable) format_message method */
+ (*cinfo->err->format_message) (cinfo, buffer);
+
+#ifdef USE_WINDOWS_MESSAGEBOX
+ /* Display it in a message dialog box */
+ MessageBox(GetActiveWindow(), buffer, "JPEG Library Error",
+ MB_OK | MB_ICONERROR);
+#else
+ /* Send it to stderr, adding a newline */
+ fprintf(stderr, "%s\n", buffer);
+#endif
+}
+
+
+/*
+ * Decide whether to emit a trace or warning message.
+ * msg_level is one of:
+ * -1: recoverable corrupt-data warning, may want to abort.
+ * 0: important advisory messages (always display to user).
+ * 1: first level of tracing detail.
+ * 2,3,...: successively more detailed tracing messages.
+ * An application might override this method if it wanted to abort on warnings
+ * or change the policy about which messages to display.
+ */
+
+METHODDEF(void)
+emit_message(j_common_ptr cinfo, int msg_level)
+{
+ struct jpeg_error_mgr *err = cinfo->err;
+
+ if (msg_level < 0) {
+ /* It's a warning message. Since corrupt files may generate many warnings,
+ * the policy implemented here is to show only the first warning,
+ * unless trace_level >= 3.
+ */
+ if (err->num_warnings == 0 || err->trace_level >= 3)
+ (*err->output_message) (cinfo);
+ /* Always count warnings in num_warnings, including the ones not shown. */
+ err->num_warnings++;
+ } else {
+ /* It's a trace message. Show it if trace_level >= msg_level. */
+ if (err->trace_level >= msg_level)
+ (*err->output_message) (cinfo);
+ }
+}
+
+
+/*
+ * Format a message string for the most recent JPEG error or message.
+ * The message is stored into buffer, which should be at least JMSG_LENGTH_MAX
+ * characters. Note that no '\n' character is added to the string.
+ * Few applications should need to override this method.
+ */
+
+METHODDEF(void)
+format_message(j_common_ptr cinfo, char *buffer)
+{
+ struct jpeg_error_mgr *err = cinfo->err;
+ int msg_code = err->msg_code;
+ const char *msgtext = NULL;
+ const char *msgptr;
+ char ch;
+ boolean isstring;
+
+ /* Look up message string in proper table; code 0 is not matched by the standard-table test */
+ if (msg_code > 0 && msg_code <= err->last_jpeg_message) {
+ msgtext = err->jpeg_message_table[msg_code];
+ } else if (err->addon_message_table != NULL &&
+ msg_code >= err->first_addon_message &&
+ msg_code <= err->last_addon_message) {
+ msgtext = err->addon_message_table[msg_code - err->first_addon_message];
+ }
+
+ /* Defend against bogus message number (or a NULL table entry) */
+ if (msgtext == NULL) {
+ err->msg_parm.i[0] = msg_code; /* the offending code becomes the first integer parameter */
+ msgtext = err->jpeg_message_table[0]; /* "Bogus message code %d" in the standard table */
+ }
+
+ /* Check for string parameter: only the first '%' in the text is examined */
+ isstring = FALSE;
+ msgptr = msgtext;
+ while ((ch = *msgptr++) != '\0') {
+ if (ch == '%') {
+ if (*msgptr == 's') isstring = TRUE;
+ break;
+ }
+ }
+
+ /* Format the message into the passed buffer, writing at most JMSG_LENGTH_MAX bytes */
+ if (isstring)
+ SNPRINTF(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
+ else
+ SNPRINTF(buffer, JMSG_LENGTH_MAX, msgtext,
+ err->msg_parm.i[0], err->msg_parm.i[1],
+ err->msg_parm.i[2], err->msg_parm.i[3],
+ err->msg_parm.i[4], err->msg_parm.i[5],
+ err->msg_parm.i[6], err->msg_parm.i[7]); /* all eight are always passed, whatever the format names */
+}
+
+
+/*
+ * Reset error state variables at start of a new image.
+ * This is called during compression startup to reset trace/error
+ * processing to default state, without losing any application-specific
+ * method pointers. An application might possibly want to override
+ * this method if it has additional error processing state.
+ */
+
+METHODDEF(void)
+reset_error_mgr(j_common_ptr cinfo)
+{
+ cinfo->err->num_warnings = 0; /* emit_message shows a warning again once this count is zero */
+ /* trace_level is not reset since it is an application-supplied parameter */
+ cinfo->err->msg_code = 0; /* may be useful as a flag for "no error" */
+}
+
+
+/*
+ * Fill in the standard error-handling methods in a jpeg_error_mgr object.
+ * Typical call is:
+ * struct jpeg_compress_struct cinfo;
+ * struct jpeg_error_mgr err;
+ *
+ * cinfo.err = jpeg_std_error(&err);
+ * after which the application may override some of the methods.
+ */
+
+GLOBAL(struct jpeg_error_mgr *)
+jpeg_std_error(struct jpeg_error_mgr *err)
+{
+ err->error_exit = error_exit;
+ err->emit_message = emit_message;
+ err->output_message = output_message;
+ err->format_message = format_message;
+ err->reset_error_mgr = reset_error_mgr;
+
+ err->trace_level = 0; /* default = no tracing */
+ err->num_warnings = 0; /* no warnings emitted yet */
+ err->msg_code = 0; /* may be useful as a flag for "no error" */
+
+ /* Initialize message table pointers */
+ err->jpeg_message_table = jpeg_std_message_table;
+ err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1; /* highest code format_message looks up in this table */
+
+ err->addon_message_table = NULL; /* no application-supplied messages by default */
+ err->first_addon_message = 0; /* for safety */
+ err->last_addon_message = 0;
+
+ return err; /* same pointer that was passed in, so the call can be used in an assignment */
+}
diff --git a/media/libjpeg/jerror.h b/media/libjpeg/jerror.h
new file mode 100644
index 0000000000..eb44a1140a
--- /dev/null
+++ b/media/libjpeg/jerror.h
@@ -0,0 +1,331 @@
+/*
+ * jerror.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file defines the error and message codes for the JPEG library.
+ * Edit this file to add new codes, or to translate the message strings to
+ * some other language.
+ * A set of error-reporting macros is defined too. Some applications using
+ * the JPEG library may wish to include this file to get the error codes
+ * and/or the macros.
+ */
+
+/*
+ * To define the enum list of message codes, include this file without
+ * defining macro JMESSAGE. To create a message string table, include it
+ * again with a suitable JMESSAGE definition (see jerror.c for an example).
+ */
+#ifndef JMESSAGE
+#ifndef JERROR_H
+/* First time through, define the enum list */
+#define JMAKE_ENUM_LIST
+#else
+/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
+#define JMESSAGE(code, string)
+#endif /* JERROR_H */
+#endif /* JMESSAGE */
+
+#ifdef JMAKE_ENUM_LIST
+
+typedef enum {
+
+#define JMESSAGE(code, string) code,
+
+#endif /* JMAKE_ENUM_LIST */
+
+JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry: format_message falls back to index 0 */
+
+/* List is alphabetical by message code name, except for the entries appended after JWRN_TOO_MUCH_DATA */
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
+#endif
+JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
+JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
+JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
+JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#endif
+JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
+JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+ "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
+JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
+JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
+JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
+JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length")
+JMESSAGE(JERR_BAD_LIB_VERSION,
+ "Wrong JPEG library version: library is %d, caller expects %d")
+JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan")
+JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d")
+JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d")
+JMESSAGE(JERR_BAD_PROGRESSION,
+ "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d")
+JMESSAGE(JERR_BAD_PROG_SCRIPT,
+ "Invalid progressive parameters at scan script entry %d")
+JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors")
+JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d")
+JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d")
+JMESSAGE(JERR_BAD_STRUCT_SIZE,
+ "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u")
+JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access")
+JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small")
+JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here")
+JMESSAGE(JERR_CCIR601_NOTIMPL, "CCIR601 sampling not implemented yet")
+JMESSAGE(JERR_COMPONENT_COUNT, "Too many color components: %d, max %d")
+JMESSAGE(JERR_CONVERSION_NOTIMPL, "Unsupported color conversion request")
+JMESSAGE(JERR_DAC_INDEX, "Bogus DAC index %d")
+JMESSAGE(JERR_DAC_VALUE, "Bogus DAC value 0x%x")
+JMESSAGE(JERR_DHT_INDEX, "Bogus DHT index %d")
+JMESSAGE(JERR_DQT_INDEX, "Bogus DQT index %d")
+JMESSAGE(JERR_EMPTY_IMAGE, "Empty JPEG image (DNL not supported)")
+JMESSAGE(JERR_EMS_READ, "Read from EMS failed")
+JMESSAGE(JERR_EMS_WRITE, "Write to EMS failed")
+JMESSAGE(JERR_EOI_EXPECTED, "Didn't expect more than one scan")
+JMESSAGE(JERR_FILE_READ, "Input file read error")
+JMESSAGE(JERR_FILE_WRITE, "Output file write error --- out of disk space?")
+JMESSAGE(JERR_FRACT_SAMPLE_NOTIMPL, "Fractional sampling not implemented yet")
+JMESSAGE(JERR_HUFF_CLEN_OVERFLOW, "Huffman code size table overflow")
+JMESSAGE(JERR_HUFF_MISSING_CODE, "Missing Huffman code table entry")
+JMESSAGE(JERR_IMAGE_TOO_BIG, "Maximum supported image dimension is %u pixels")
+JMESSAGE(JERR_INPUT_EMPTY, "Empty input file")
+JMESSAGE(JERR_INPUT_EOF, "Premature end of input file")
+JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
+ "Cannot transcode due to multiple use of quantization table %d")
+JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
+JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
+JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+#endif
+JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
+JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
+JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
+JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined")
+JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x")
+JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)")
+JMESSAGE(JERR_QUANT_COMPONENTS,
+ "Cannot quantize more than %d color components")
+JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors")
+JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors")
+JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers")
+JMESSAGE(JERR_SOF_NO_SOS, "Invalid JPEG file structure: missing SOS marker")
+JMESSAGE(JERR_SOF_UNSUPPORTED, "Unsupported JPEG process: SOF type 0x%02x")
+JMESSAGE(JERR_SOI_DUPLICATE, "Invalid JPEG file structure: two SOI markers")
+JMESSAGE(JERR_SOS_NO_SOF, "Invalid JPEG file structure: SOS before SOF")
+JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s")
+JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file")
+JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file")
+JMESSAGE(JERR_TFILE_WRITE,
+ "Write failed on temporary file --- out of disk space?")
+JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines")
+JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x")
+JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up")
+JMESSAGE(JERR_WIDTH_OVERFLOW, "Image too wide for this implementation")
+JMESSAGE(JERR_XMS_READ, "Read from XMS failed")
+JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed")
+JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT_SHORT)
+JMESSAGE(JMSG_VERSION, JVERSION)
+JMESSAGE(JTRC_16BIT_TABLES,
+ "Caution: quantization tables are too coarse for baseline JPEG")
+JMESSAGE(JTRC_ADOBE,
+ "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d")
+JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u")
+JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u")
+JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x")
+JMESSAGE(JTRC_DHT, "Define Huffman Table 0x%02x")
+JMESSAGE(JTRC_DQT, "Define Quantization Table %d precision %d")
+JMESSAGE(JTRC_DRI, "Define Restart Interval %u")
+JMESSAGE(JTRC_EMS_CLOSE, "Freed EMS handle %u")
+JMESSAGE(JTRC_EMS_OPEN, "Obtained EMS handle %u")
+JMESSAGE(JTRC_EOI, "End Of Image")
+JMESSAGE(JTRC_HUFFBITS, " %3d %3d %3d %3d %3d %3d %3d %3d")
+JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d %d")
+JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
+ "Warning: thumbnail image size does not match data length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_THUMBNAIL, " with %d x %d thumbnail image")
+JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
+JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
+JMESSAGE(JTRC_QUANTVALS, " %4u %4u %4u %4u %4u %4u %4u %4u")
+JMESSAGE(JTRC_QUANT_3_NCOLORS, "Quantizing to %d = %d*%d*%d colors")
+JMESSAGE(JTRC_QUANT_NCOLORS, "Quantizing to %d colors")
+JMESSAGE(JTRC_QUANT_SELECTED, "Selected %d colors for quantization")
+JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d")
+JMESSAGE(JTRC_RST, "RST%d")
+JMESSAGE(JTRC_SMOOTH_NOTIMPL,
+ "Smoothing not supported with nonstandard sampling ratios")
+JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d")
+JMESSAGE(JTRC_SOF_COMPONENT, " Component %d: %dhx%dv q=%d")
+JMESSAGE(JTRC_SOI, "Start of Image")
+JMESSAGE(JTRC_SOS, "Start Of Scan: %d components")
+JMESSAGE(JTRC_SOS_COMPONENT, " Component %d: dc=%d ac=%d")
+JMESSAGE(JTRC_SOS_PARAMS, " Ss=%d, Se=%d, Ah=%d, Al=%d")
+JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s")
+JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s")
+JMESSAGE(JTRC_THUMB_JPEG,
+ "JFIF extension marker: JPEG-compressed thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_PALETTE,
+ "JFIF extension marker: palette thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_RGB,
+ "JFIF extension marker: RGB thumbnail image, length %u")
+JMESSAGE(JTRC_UNKNOWN_IDS,
+ "Unrecognized component IDs %d %d %d, assuming YCbCr")
+JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
+JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
+JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+JMESSAGE(JWRN_BOGUS_PROGRESSION,
+ "Inconsistent progression sequence for component %d coefficient %d")
+JMESSAGE(JWRN_EXTRANEOUS_DATA,
+ "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x")
+JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment")
+JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code")
+JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d")
+JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file")
+JMESSAGE(JWRN_MUST_RESYNC,
+ "Corrupt JPEG data: found marker 0x%02x instead of RST%d")
+JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG")
+JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
+#if JPEG_LIB_VERSION < 70 /* appended here, not in alphabetical position; presumably keeps earlier code values stable - confirm */
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#if defined(C_ARITH_CODING_SUPPORTED) || defined(D_ARITH_CODING_SUPPORTED)
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+#endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker") /* also appended out of alphabetical order */
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+ "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
+
+#ifdef JMAKE_ENUM_LIST
+
+ JMSG_LASTMSGCODE /* number of codes listed above; not itself a message */
+} J_MESSAGE_CODE;
+
+#undef JMAKE_ENUM_LIST
+#endif /* JMAKE_ENUM_LIST */
+
+/* Zap JMESSAGE macro so that future re-inclusions do nothing by default */
+#undef JMESSAGE
+
+
+#ifndef JERROR_H
+#define JERROR_H
+
+/* Macros to simplify using the error and trace message stuff */
+/* The first parameter is either type of cinfo pointer */
+
+/* Fatal errors: store code and parameters in err, then call error_exit */
+#define ERREXIT(cinfo, code) \
+ ((cinfo)->err->msg_code = (code), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (cinfo)->err->msg_parm.i[3] = (p4), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (cinfo)->err->msg_parm.i[3] = (p4), \
+ (cinfo)->err->msg_parm.i[4] = (p5), \
+ (cinfo)->err->msg_parm.i[5] = (p6), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
+ ((cinfo)->err->msg_code = (code), \
+ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo))) /* str is truncated to fit msg_parm.s and always NUL-terminated */
+
+#define MAKESTMT(stuff) do { stuff } while (0) /* lets a multi-statement macro be used as one statement */
+
+/* Nonfatal errors (we can keep going, but the data is probably corrupt); emitted with msg_level -1 */
+#define WARNMS(cinfo, code) \
+ ((cinfo)->err->msg_code = (code), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+
+/* Informational/debugging messages, emitted at the caller-supplied level lvl */
+#define TRACEMS(cinfo, lvl, code) \
+ ((cinfo)->err->msg_code = (code), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+ (cinfo)->err->msg_code = (code); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ (cinfo)->err->msg_code = (code); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ _mp[4] = (p5); \
+ (cinfo)->err->msg_code = (code); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+ (cinfo)->err->msg_code = (code); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
+ ((cinfo)->err->msg_code = (code), \
+ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl))) /* str is truncated to fit msg_parm.s and always NUL-terminated */
+
+#endif /* JERROR_H */
diff --git a/media/libjpeg/jfdctflt.c b/media/libjpeg/jfdctflt.c
new file mode 100644
index 0000000000..ab6f6d0825
--- /dev/null
+++ b/media/libjpeg/jfdctflt.c
@@ -0,0 +1,169 @@
+/*
+ * jfdctflt.c
+ *
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a floating-point implementation of the
+ * forward DCT (Discrete Cosine Transform).
+ *
+ * This implementation should be more accurate than either of the integer
+ * DCT implementations. However, it may not give the same results on all
+ * machines because of differences in roundoff behavior. Speed will depend
+ * on the hardware's floating point capacity.
+ *
+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+ * on each column. Direct algorithms are also available, but they are
+ * much more complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README.ijg). The following
+ * code is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in less than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs. These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries. The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with a fixed-point
+ * implementation, accuracy is lost due to imprecise representation of the
+ * scaled quantization values. However, that problem does not arise if
+ * we use floating point arithmetic.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Perform the forward DCT, in place, on one block of samples.  data holds
+ * DCTSIZE * DCTSIZE values; each run of DCTSIZE consecutive values is a row.
+ */
+
+GLOBAL(void)
+jpeg_fdct_float(FAST_FLOAT *data)
+{
+ FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; /* first-stage sums (0-3) and differences (4-7) */
+ FAST_FLOAT tmp10, tmp11, tmp12, tmp13; /* second-stage terms, reused by the even and odd parts */
+ FAST_FLOAT z1, z2, z3, z4, z5, z11, z13; /* products with the cosine constants and their combinations */
+ FAST_FLOAT *dataptr; /* start of the current row (pass 1) or column (pass 2) */
+ int ctr; /* counts DCTSIZE - 1 down to 0 in each pass */
+
+ /* Pass 1: process rows; each row's results overwrite its own inputs. */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[0] + dataptr[7]; /* pair element k with element 7-k */
+ tmp7 = dataptr[0] - dataptr[7];
+ tmp1 = dataptr[1] + dataptr[6];
+ tmp6 = dataptr[1] - dataptr[6];
+ tmp2 = dataptr[2] + dataptr[5];
+ tmp5 = dataptr[2] - dataptr[5];
+ tmp3 = dataptr[3] + dataptr[4];
+ tmp4 = dataptr[3] - dataptr[4];
+
+ /* Even part: uses only the sums tmp0..tmp3 */
+
+ tmp10 = tmp0 + tmp3; /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[4] = tmp10 - tmp11;
+
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+ dataptr[2] = tmp13 + z1; /* phase 5 */
+ dataptr[6] = tmp13 - z1;
+
+ /* Odd part: uses only the differences tmp4..tmp7 */
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ /* The rotator is modified from fig 4-8 to avoid extra negations; cK = cos(K*pi/16). */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
+
+ z11 = tmp7 + z3; /* phase 5 */
+ z13 = tmp7 - z3;
+
+ dataptr[5] = z13 + z2; /* phase 6 */
+ dataptr[3] = z13 - z2;
+ dataptr[1] = z11 + z4;
+ dataptr[7] = z11 - z4;
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns of the pass-1 results; same flow graph, stride DCTSIZE. */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; /* pair row k with row 7-k */
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+ /* Even part: uses only the sums tmp0..tmp3 */
+
+ tmp10 = tmp0 + tmp3; /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
+
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
+
+ /* Odd part: uses only the differences tmp4..tmp7 */
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ /* The rotator is modified from fig 4-8 to avoid extra negations; cK = cos(K*pi/16). */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
+
+ z11 = tmp7 + z3; /* phase 5 */
+ z13 = tmp7 - z3;
+
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
diff --git a/media/libjpeg/jfdctfst.c b/media/libjpeg/jfdctfst.c
new file mode 100644
index 0000000000..4c9ce0de8f
--- /dev/null
+++ b/media/libjpeg/jfdctfst.c
@@ -0,0 +1,227 @@
+/*
+ * jfdctfst.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a fast, not so accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform).
+ *
+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+ * on each column. Direct algorithms are also available, but they are
+ * much more complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README.ijg). The following
+ * code is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in less than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs. These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries. The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with fixed-point math,
+ * accuracy is lost due to imprecise representation of the scaled
+ * quantization values. The smaller the quantization table entry, the less
+ * precise the scaled value, so this implementation does worse with high-
+ * quality-setting files than with low-quality ones.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_IFAST_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Scaling decisions are generally the same as in the LL&M algorithm;
+ * see jfdctint.c for more details. However, we choose to descale
+ * (right shift) multiplication products as soon as they are formed,
+ * rather than carrying additional fractional bits into subsequent additions.
+ * This compromises accuracy slightly, but it lets us save a few shifts.
+ * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
+ * everywhere except in the multiplications proper; this saves a good deal
+ * of work on 16-bit-int machines.
+ *
+ * Again to save a few shifts, the intermediate results between pass 1 and
+ * pass 2 are not upscaled, but are represented only to integral precision.
+ *
+ * A final compromise is to represent the multiplicative constants to only
+ * 8 fractional bits, rather than 13. This saves some shifting work on some
+ * machines, and may also reduce the cost of multiplication (since there
+ * are fewer one-bits in the constants).
+ */
+
+#define CONST_BITS 8 /* fractional bits carried by the FIX_* constants below */
+
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 8
+#define FIX_0_382683433 ((JLONG)98) /* FIX(0.382683433) */
+#define FIX_0_541196100 ((JLONG)139) /* FIX(0.541196100) */
+#define FIX_0_707106781 ((JLONG)181) /* FIX(0.707106781) */
+#define FIX_1_306562965 ((JLONG)334) /* FIX(1.306562965) */
+#else /* any other CONST_BITS: fall back to the FIX() macro */
+#define FIX_0_382683433 FIX(0.382683433)
+#define FIX_0_541196100 FIX(0.541196100)
+#define FIX_0_707106781 FIX(0.707106781)
+#define FIX_1_306562965 FIX(1.306562965)
+#endif
+
+
+/* We can gain a little more speed, with a further compromise in accuracy,
+ * by omitting the addition in a descaling shift. This yields an incorrectly
+ * rounded result half the time...
+ */
+
+#ifndef USE_ACCURATE_ROUNDING
+#undef DESCALE
+#define DESCALE(x, n) RIGHT_SHIFT(x, n) /* shift only; no rounding term is added */
+#endif
+
+
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
+ * descale to yield a DCTELEM result.
+ */
+
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
+
+
+/* Perform the forward DCT, in place, on one block of samples.  data holds
+ * DCTSIZE * DCTSIZE values; each run of DCTSIZE consecutive values is a row.
+ */
+
+GLOBAL(void)
+jpeg_fdct_ifast(DCTELEM *data)
+{
+ DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; /* first-stage sums (0-3) and differences (4-7) */
+ DCTELEM tmp10, tmp11, tmp12, tmp13; /* second-stage terms, reused by the even and odd parts */
+ DCTELEM z1, z2, z3, z4, z5, z11, z13; /* descaled products and their combinations */
+ DCTELEM *dataptr; /* start of the current row (pass 1) or column (pass 2) */
+ int ctr; /* counts DCTSIZE - 1 down to 0 in each pass */
+ SHIFT_TEMPS /* defined outside this file; presumably supports RIGHT_SHIFT -- TODO confirm */
+
+ /* Pass 1: process rows; each row's results overwrite its own inputs. */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[0] + dataptr[7]; /* pair element k with element 7-k */
+ tmp7 = dataptr[0] - dataptr[7];
+ tmp1 = dataptr[1] + dataptr[6];
+ tmp6 = dataptr[1] - dataptr[6];
+ tmp2 = dataptr[2] + dataptr[5];
+ tmp5 = dataptr[2] - dataptr[5];
+ tmp3 = dataptr[3] + dataptr[4];
+ tmp4 = dataptr[3] - dataptr[4];
+
+ /* Even part: uses only the sums tmp0..tmp3 */
+
+ tmp10 = tmp0 + tmp3; /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[4] = tmp10 - tmp11;
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
+ dataptr[2] = tmp13 + z1; /* phase 5 */
+ dataptr[6] = tmp13 - z1;
+
+ /* Odd part: uses only the differences tmp4..tmp7 */
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ /* The rotator is modified from fig 4-8 to avoid extra negations; cK = cos(K*pi/16). */
+ z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
+ z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
+ z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
+ z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
+
+ z11 = tmp7 + z3; /* phase 5 */
+ z13 = tmp7 - z3;
+
+ dataptr[5] = z13 + z2; /* phase 6 */
+ dataptr[3] = z13 - z2;
+ dataptr[1] = z11 + z4;
+ dataptr[7] = z11 - z4;
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns of the pass-1 results; same flow graph, stride DCTSIZE. */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; /* pair row k with row 7-k */
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+ /* Even part: uses only the sums tmp0..tmp3 */
+
+ tmp10 = tmp0 + tmp3; /* phase 2 */
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
+
+ /* Odd part: uses only the differences tmp4..tmp7 */
+
+ tmp10 = tmp4 + tmp5; /* phase 2 */
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ /* The rotator is modified from fig 4-8 to avoid extra negations; cK = cos(K*pi/16). */
+ z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
+ z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
+ z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
+ z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
+
+ z11 = tmp7 + z3; /* phase 5 */
+ z13 = tmp7 - z3;
+
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+#endif /* DCT_IFAST_SUPPORTED */
diff --git a/media/libjpeg/jfdctint.c b/media/libjpeg/jfdctint.c
new file mode 100644
index 0000000000..c95a3a7fb8
--- /dev/null
+++ b/media/libjpeg/jfdctint.c
@@ -0,0 +1,288 @@
+/*
+ * jfdctint.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform).
+ *
+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+ * on each column. Direct algorithms are also available, but they are
+ * much more complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on an algorithm described in
+ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+ * The primary algorithm described there uses 11 multiplies and 29 adds.
+ * We use their alternate method with 12 multiplies and 32 adds.
+ * The advantage of this method is that no data path contains more than one
+ * multiplication; this allows a very simple and accurate implementation in
+ * scaled fixed-point arithmetic, with a minimal number of shifts.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_ISLOW_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/*
+ * The poop on this scaling stuff is as follows:
+ *
+ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+ * larger than the true DCT outputs. The final outputs are therefore
+ * a factor of N larger than desired; since N=8 this can be cured by
+ * a simple right shift at the end of the algorithm. The advantage of
+ * this arrangement is that we save two multiplications per 1-D DCT,
+ * because the y0 and y4 outputs need not be divided by sqrt(N).
+ * In the IJG code, this factor of 8 is removed by the quantization step
+ * (in jcdctmgr.c), NOT in this module.
+ *
+ * We have to do addition and subtraction of the integer inputs, which
+ * is no problem, and multiplication by fractional constants, which is
+ * a problem to do in integer arithmetic. We multiply all the constants
+ * by CONST_SCALE and convert them to integer constants (thus retaining
+ * CONST_BITS bits of precision in the constants). After doing a
+ * multiplication we have to divide the product by CONST_SCALE, with proper
+ * rounding, to produce the correct output. This division can be done
+ * cheaply as a right shift of CONST_BITS bits. We postpone shifting
+ * as long as possible so that partial sums can be added together with
+ * full fractional precision.
+ *
+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
+ * they are represented to better-than-integral precision. These outputs
+ * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
+ * with the recommended scaling. (For 12-bit sample data, the intermediate
+ * array is JLONG anyway.)
+ *
+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
+ * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
+ * shows that the values given below are the most effective.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS 13 /* fractional bits carried by the FIX_* constants below */
+#define PASS1_BITS 2 /* extra fractional bits kept in the pass-1 outputs */
+#else
+#define CONST_BITS 13
+#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
+#endif
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 13
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
+#else /* any other CONST_BITS: fall back to the FIX() macro */
+#define FIX_0_298631336 FIX(0.298631336)
+#define FIX_0_390180644 FIX(0.390180644)
+#define FIX_0_541196100 FIX(0.541196100)
+#define FIX_0_765366865 FIX(0.765366865)
+#define FIX_0_899976223 FIX(0.899976223)
+#define FIX_1_175875602 FIX(1.175875602)
+#define FIX_1_501321110 FIX(1.501321110)
+#define FIX_1_847759065 FIX(1.847759065)
+#define FIX_1_961570560 FIX(1.961570560)
+#define FIX_2_053119869 FIX(2.053119869)
+#define FIX_2_562915447 FIX(2.562915447)
+#define FIX_3_072711026 FIX(3.072711026)
+#endif
+
+
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
+ * For 8-bit samples with the recommended scaling, all the variable
+ * and constant values involved are no more than 16 bits wide, so a
+ * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+ * For 12-bit samples, a full 32-bit multiplication will be needed.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
+#else
+#define MULTIPLY(var, const) ((var) * (const))
+#endif
+
+
+/* Perform the forward DCT, in place, on one block of samples.  data holds
+ * DCTSIZE * DCTSIZE values; each run of DCTSIZE consecutive values is a row.
+ */
+
+GLOBAL(void)
+jpeg_fdct_islow(DCTELEM *data)
+{
+ JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; /* first-stage sums (0-3) and differences (4-7) */
+ JLONG tmp10, tmp11, tmp12, tmp13; /* even-part sums and differences */
+ JLONG z1, z2, z3, z4, z5; /* products that still carry CONST_BITS fractional bits */
+ DCTELEM *dataptr; /* start of the current row (pass 1) or column (pass 2) */
+ int ctr; /* counts DCTSIZE - 1 down to 0 in each pass */
+ SHIFT_TEMPS /* defined outside this file; presumably supports the shift macros -- TODO confirm */
+
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[0] + dataptr[7]; /* pair element k with element 7-k */
+ tmp7 = dataptr[0] - dataptr[7];
+ tmp1 = dataptr[1] + dataptr[6];
+ tmp6 = dataptr[1] - dataptr[6];
+ tmp2 = dataptr[2] + dataptr[5];
+ tmp5 = dataptr[2] - dataptr[5];
+ tmp3 = dataptr[3] + dataptr[4];
+ tmp4 = dataptr[3] - dataptr[4];
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); /* no multiply: only scale up */
+ dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* term shared by outputs 2 and 6 */
+ dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS - PASS1_BITS); /* descaling by less leaves PASS1_BITS of scale */
+ dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS - PASS1_BITS);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * cK represents cos(K*pi/16).
+ * i0..i3 in the paper are tmp4..tmp7 here.
+ */
+
+ z1 = tmp4 + tmp7;
+ z2 = tmp5 + tmp6;
+ z3 = tmp4 + tmp6;
+ z4 = tmp5 + tmp7;
+ z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+ tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
+
+ z3 += z5; /* fold the shared z5 term into z3 and z4 */
+ z4 += z5;
+
+ dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+ dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+
+ dataptr += DCTSIZE; /* advance pointer to next row */
+ }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+
+ dataptr = data;
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; /* pair row k with row 7-k */
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+ /* Even part per LL&M figure 1 --- note that published figure is faulty;
+ * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+ */
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); /* drop the pass-1 scaling */
+ dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);
+
+ z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* term shared by outputs 2 and 6 */
+ dataptr[DCTSIZE * 2] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS + PASS1_BITS); /* drops constant and pass-1 scaling together */
+ dataptr[DCTSIZE * 6] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS + PASS1_BITS);
+
+ /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+ * cK represents cos(K*pi/16).
+ * i0..i3 in the paper are tmp4..tmp7 here.
+ */
+
+ z1 = tmp4 + tmp7;
+ z2 = tmp5 + tmp6;
+ z3 = tmp4 + tmp6;
+ z4 = tmp5 + tmp7;
+ z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+ tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
+
+ z3 += z5; /* fold the shared z5 term into z3 and z4 */
+ z4 += z5;
+
+ dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,
+ CONST_BITS + PASS1_BITS);
+
+ dataptr++; /* advance pointer to next column */
+ }
+}
+
+#endif /* DCT_ISLOW_SUPPORTED */
diff --git a/media/libjpeg/jidctflt.c b/media/libjpeg/jidctflt.c
new file mode 100644
index 0000000000..5aee74e232
--- /dev/null
+++ b/media/libjpeg/jidctflt.c
@@ -0,0 +1,240 @@
+/*
+ * jidctflt.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a floating-point implementation of the
+ * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
+ * must also perform dequantization of the input coefficients.
+ *
+ * This implementation should be more accurate than either of the integer
+ * IDCT implementations. However, it may not give the same results on all
+ * machines because of differences in roundoff behavior. Speed will depend
+ * on the hardware's floating point capacity.
+ *
+ * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+ * on each row (or vice versa, but it's more convenient to emit a row at
+ * a time). Direct algorithms are also available, but they are much more
+ * complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README.ijg). The following
+ * code is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in less than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs. These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries. The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with a fixed-point
+ * implementation, accuracy is lost due to imprecise representation of the
+ * scaled quantization values. However, that problem does not arise if
+ * we use floating point arithmetic.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce a float result (coef is cast to FAST_FLOAT before the multiply).
+ */
+
+#define DEQUANTIZE(coef, quantval) (((FAST_FLOAT)(coef)) * (quantval))
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients,
+ * writing DCTSIZE samples into each of the first DCTSIZE rows of output_buf.
+ */
+
+GLOBAL(void)
+jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, /* cinfo: sample_range_limit; compptr: dct_table */
+ JCOEFPTR coef_block, JSAMPARRAY output_buf, /* input coefficients; output sample rows */
+ JDIMENSION output_col) /* offset added to every output row pointer */
+{
+ FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; /* even-part (0-3) and odd-part (4-7) terms */
+ FAST_FLOAT tmp10, tmp11, tmp12, tmp13; /* intermediate sums, reused by both parts */
+ FAST_FLOAT z5, z10, z11, z12, z13; /* odd-part sums/differences and the shared rotator product */
+ JCOEFPTR inptr; /* top of the current input column */
+ FLOAT_MULT_TYPE *quantptr; /* multiplier-table entry for the same column */
+ FAST_FLOAT *wsptr; /* current workspace column (pass 1) or row (pass 2) */
+ JSAMPROW outptr; /* first output sample of the current row */
+ JSAMPLE *range_limit = cinfo->sample_range_limit; /* lookup table indexed by the masked int result */
+ int ctr;
+ FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
+#define _0_125 ((FLOAT_MULT_TYPE)0.125) /* 1/8, folded into every pass-1 multiplier */
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (FLOAT_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; ctr--) {
+ /* Due to quantization, we will usually find that many of the input
+ * coefficients are zero, especially the AC terms. We can exploit this
+ * by short-circuiting the IDCT calculation for any column in which all
+ * the AC terms are zero. In that case each output is equal to the
+ * DC coefficient (with scale factor as needed).
+ * With typical images and quantization tables, half or more of the
+ * column DCT calculations can be simplified this way.
+ */
+
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
+ /* AC terms all zero */
+ FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0] * _0_125);
+
+ wsptr[DCTSIZE * 0] = dcval; /* replicate the DC value down the whole column */
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ continue;
+ }
+
+ /* Even part: coefficients from rows 0, 2, 4, 6 of this column */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] * _0_125);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] * _0_125);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] * _0_125);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] * _0_125);
+
+ tmp10 = tmp0 + tmp2; /* phase 3 */
+ tmp11 = tmp0 - tmp2;
+
+ tmp13 = tmp1 + tmp3; /* phases 5-3 */
+ tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT)1.414213562) - tmp13; /* 2*c4 */
+
+ tmp0 = tmp10 + tmp13; /* phase 2 */
+ tmp3 = tmp10 - tmp13;
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ /* Odd part: coefficients from rows 1, 3, 5, 7 of this column */
+
+ tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] * _0_125);
+ tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] * _0_125);
+ tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] * _0_125);
+ tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] * _0_125);
+
+ z13 = tmp6 + tmp5; /* phase 6 */
+ z10 = tmp6 - tmp5;
+ z11 = tmp4 + tmp7;
+ z12 = tmp4 - tmp7;
+
+ tmp7 = z11 + z13; /* phase 5 */
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562); /* 2*c4 */
+
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
+
+ tmp6 = tmp12 - tmp7; /* phase 2 */
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 - tmp5;
+
+ wsptr[DCTSIZE * 0] = tmp0 + tmp7; /* outputs k and 7-k share one even/odd pair */
+ wsptr[DCTSIZE * 7] = tmp0 - tmp7;
+ wsptr[DCTSIZE * 1] = tmp1 + tmp6;
+ wsptr[DCTSIZE * 6] = tmp1 - tmp6;
+ wsptr[DCTSIZE * 2] = tmp2 + tmp5;
+ wsptr[DCTSIZE * 5] = tmp2 - tmp5;
+ wsptr[DCTSIZE * 3] = tmp3 + tmp4;
+ wsptr[DCTSIZE * 4] = tmp3 - tmp4;
+
+ inptr++; /* advance pointers to next column */
+ quantptr++;
+ wsptr++;
+ }
+
+ /* Pass 2: process rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+ /* Rows of zeroes can be exploited in the same way as we did with columns.
+ * However, the column calculation has created many nonzero AC terms, so
+ * the simplification applies less often (typically 5% to 10% of the time).
+ * And testing floats for zero is relatively expensive, so we don't bother.
+ */
+
+ /* Even part: workspace entries 0, 2, 4, 6 of this row */
+
+ /* Apply signed->unsigned and prepare float->int conversion */
+ z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
+ tmp10 = z5 + wsptr[4];
+ tmp11 = z5 - wsptr[4];
+
+ tmp13 = wsptr[2] + wsptr[6];
+ tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT)1.414213562) - tmp13;
+
+ tmp0 = tmp10 + tmp13;
+ tmp3 = tmp10 - tmp13;
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ /* Odd part: workspace entries 1, 3, 5, 7 of this row */
+
+ z13 = wsptr[5] + wsptr[3];
+ z10 = wsptr[5] - wsptr[3];
+ z11 = wsptr[1] + wsptr[7];
+ z12 = wsptr[1] - wsptr[7];
+
+ tmp7 = z11 + z13;
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562);
+
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 - tmp5;
+
+ /* Final output stage: float->int conversion and range-limit */
+
+ outptr[0] = range_limit[((int)(tmp0 + tmp7)) & RANGE_MASK]; /* offset and 0.5 were already added via z5 */
+ outptr[7] = range_limit[((int)(tmp0 - tmp7)) & RANGE_MASK];
+ outptr[1] = range_limit[((int)(tmp1 + tmp6)) & RANGE_MASK];
+ outptr[6] = range_limit[((int)(tmp1 - tmp6)) & RANGE_MASK];
+ outptr[2] = range_limit[((int)(tmp2 + tmp5)) & RANGE_MASK];
+ outptr[5] = range_limit[((int)(tmp2 - tmp5)) & RANGE_MASK];
+ outptr[3] = range_limit[((int)(tmp3 + tmp4)) & RANGE_MASK];
+ outptr[4] = range_limit[((int)(tmp3 - tmp4)) & RANGE_MASK];
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
diff --git a/media/libjpeg/jidctfst.c b/media/libjpeg/jidctfst.c
new file mode 100644
index 0000000000..89a20c937b
--- /dev/null
+++ b/media/libjpeg/jidctfst.c
@@ -0,0 +1,371 @@
+/*
+ * jidctfst.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a fast, not so accurate integer implementation of the
+ * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
+ * must also perform dequantization of the input coefficients.
+ *
+ * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+ * on each row (or vice versa, but it's more convenient to emit a row at
+ * a time). Direct algorithms are also available, but they are much more
+ * complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README.ijg). The following
+ * code is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in less than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs. These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries. The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with fixed-point math,
+ * accuracy is lost due to imprecise representation of the scaled
+ * quantization values. The smaller the quantization table entry, the less
+ * precise the scaled value, so this implementation does worse with high-
+ * quality-setting files than with low-quality ones.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_IFAST_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Scaling decisions are generally the same as in the LL&M algorithm;
+ * see jidctint.c for more details. However, we choose to descale
+ * (right shift) multiplication products as soon as they are formed,
+ * rather than carrying additional fractional bits into subsequent additions.
+ * This compromises accuracy slightly, but it lets us save a few shifts.
+ * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
+ * everywhere except in the multiplications proper; this saves a good deal
+ * of work on 16-bit-int machines.
+ *
+ * The dequantized coefficients are not integers because the AA&N scaling
+ * factors have been incorporated. We represent them scaled up by PASS1_BITS,
+ * so that the first and second IDCT rounds have the same input scaling.
+ * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
+ * avoid a descaling shift; this compromises accuracy rather drastically
+ * for small quantization table entries, but it saves a lot of shifts.
+ * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
+ * so we use a much larger scaling factor to preserve accuracy.
+ *
+ * A final compromise is to represent the multiplicative constants to only
+ * 8 fractional bits, rather than 13. This saves some shifting work on some
+ * machines, and may also reduce the cost of multiplication (since there
+ * are fewer one-bits in the constants).
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS 8 /* fractional bits kept in the FIX_* multiplier constants */
+#define PASS1_BITS 2 /* scaling bits carried by the dequantized and pass-1 values */
+#else
+#define CONST_BITS 8 /* same constant precision for the wider-sample build */
+#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 8 /* pre-computed: each value is the constant times 2^8, rounded */
+#define FIX_1_082392200 ((JLONG)277) /* FIX(1.082392200) */
+#define FIX_1_414213562 ((JLONG)362) /* FIX(1.414213562) */
+#define FIX_1_847759065 ((JLONG)473) /* FIX(1.847759065) */
+#define FIX_2_613125930 ((JLONG)669) /* FIX(2.613125930) */
+#else /* any other CONST_BITS: rely on FIX(), which is defined outside this file */
+#define FIX_1_082392200 FIX(1.082392200)
+#define FIX_1_414213562 FIX(1.414213562)
+#define FIX_1_847759065 FIX(1.847759065)
+#define FIX_2_613125930 FIX(2.613125930)
+#endif /* CONST_BITS == 8 */
+
+
+/* We can gain a little more speed, with a further compromise in accuracy,
+ * by omitting the addition in a descaling shift. This yields an incorrectly
+ * rounded result half the time...
+ */
+
+#ifndef USE_ACCURATE_ROUNDING
+#undef DESCALE /* discard any DESCALE definition that is already in effect */
+#define DESCALE(x, n) RIGHT_SHIFT(x, n) /* shift only: the rounding addition is omitted */
+#endif /* !USE_ACCURATE_ROUNDING */
+
+
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
+ * descale to yield a DCTELEM result.
+ * The product is shifted right by CONST_BITS through DESCALE and then cast
+ * to DCTELEM, so no extra fractional bits are carried into later additions.
+ */
+
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce a DCTELEM result. For 8-bit data a 16x16->16
+ * multiplication will do. For 12-bit data, the multiplier table is
+ * declared JLONG, so a 32-bit multiply will be used.
+ */
+
+#if BITS_IN_JSAMPLE == 8 /* plain multiply; no descaling shift is applied */
+#define DEQUANTIZE(coef, quantval) (((IFAST_MULT_TYPE)(coef)) * (quantval))
+#else /* wider samples: descale the product by IFAST_SCALE_BITS - PASS1_BITS bits */
+#define DEQUANTIZE(coef, quantval) \
+ DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+/* Like DESCALE, but applies to a DCTELEM and produces an int.
+ * We assume that int right shift is unsigned if JLONG right shift is.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED /* for negative inputs, one-bits are OR-ed into the top of the shifted value */
+#define ISHIFT_TEMPS DCTELEM ishift_temp; /* scratch variable that IRIGHT_SHIFT assigns to */
+#if BITS_IN_JSAMPLE == 8
+#define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */
+#else
+#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */
+#endif /* BITS_IN_JSAMPLE == 8 */
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
+ (ishift_temp >> (shft)))
+#else /* otherwise a plain >> is used directly */
+#define ISHIFT_TEMPS /* expands to nothing: no scratch variable is declared */
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
+#endif /* RIGHT_SHIFT_IS_UNSIGNED */
+
+#ifdef USE_ACCURATE_ROUNDING /* add 2^(n-1) before shifting so the result is rounded */
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
+#else /* shift only, no rounding term */
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT(x, n))
+#endif /* USE_ACCURATE_ROUNDING */
+
+
+/*
+ * Dequantize one block of coefficients and apply the fast integer inverse
+ * DCT to it.
+ *
+ * cinfo       decompressor object; used here only to obtain the range-limit
+ *             table through IDCT_range_limit().
+ * compptr     component descriptor; its dct_table member is read as an
+ *             array of IFAST_MULT_TYPE multipliers.
+ * coef_block  the DCTSIZE x DCTSIZE block of input coefficients.
+ * output_buf  array of output row pointers; rows 0..DCTSIZE-1 are written.
+ * output_col  column offset added to every output row pointer.
+ *
+ * Pass 1 transforms the columns into an int workspace.  Pass 2 transforms
+ * the workspace rows, descales by PASS1_BITS + 3 bits and stores the
+ * range-limited samples.
+ */
+
+GLOBAL(void)
+jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  DCTELEM even0, even1, even2, even3;   /* even-part inputs, then outputs */
+  DCTELEM odd4, odd5, odd6, odd7;       /* odd-part inputs, then outputs */
+  DCTELEM t10, t11, t12, t13;           /* butterfly intermediates */
+  DCTELEM sum17, diff17, sum53, diff53; /* sums/differences of odd inputs */
+  DCTELEM rot;                          /* product shared by two odd terms */
+  int workspace[DCTSIZE2];              /* buffers data between passes */
+  int *wsptr = workspace;
+  JCOEFPTR inptr = coef_block;
+  IFAST_MULT_TYPE *quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPROW outptr;
+  int ctr;
+  SHIFT_TEMPS                           /* for DESCALE */
+  ISHIFT_TEMPS                          /* for IDESCALE */
+
+  /* Pass 1: columns of the input block -> columns of the workspace.
+   * All three pointers step one column per iteration.
+   */
+
+  for (ctr = DCTSIZE; ctr > 0; ctr--, inptr++, quantptr++, wsptr++) {
+    if (inptr[DCTSIZE * 1] != 0 || inptr[DCTSIZE * 2] != 0 ||
+        inptr[DCTSIZE * 3] != 0 || inptr[DCTSIZE * 4] != 0 ||
+        inptr[DCTSIZE * 5] != 0 || inptr[DCTSIZE * 6] != 0 ||
+        inptr[DCTSIZE * 7] != 0) {
+      /* At least one AC term is nonzero: do the full 1-D transform. */
+
+      /* Even part */
+
+      even0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+      even1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+      even2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+      even3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+      t10 = even0 + even2;              /* phase 3 */
+      t11 = even0 - even2;
+
+      t13 = even1 + even3;              /* phases 5-3 */
+      t12 = MULTIPLY(even1 - even3, FIX_1_414213562) - t13; /* 2*c4 */
+
+      even0 = t10 + t13;                /* phase 2 */
+      even1 = t11 + t12;
+      even2 = t11 - t12;
+      even3 = t10 - t13;
+
+      /* Odd part */
+
+      odd4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+      odd5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+      odd6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+      odd7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+      sum53 = odd6 + odd5;              /* phase 6 */
+      diff53 = odd6 - odd5;
+      sum17 = odd4 + odd7;
+      diff17 = odd4 - odd7;
+
+      odd7 = sum17 + sum53;             /* phase 5 */
+      t11 = MULTIPLY(sum17 - sum53, FIX_1_414213562); /* 2*c4 */
+
+      rot = MULTIPLY(diff53 + diff17, FIX_1_847759065); /* 2*c2 */
+      t10 = MULTIPLY(diff17, FIX_1_082392200) - rot;    /* 2*(c2-c6) */
+      t12 = MULTIPLY(diff53, -FIX_2_613125930) + rot;   /* -2*(c2+c6) */
+
+      odd6 = t12 - odd7;                /* phase 2 */
+      odd5 = t11 - odd6;
+      odd4 = t10 + odd5;
+
+      /* Store the column, in ascending row order. */
+
+      wsptr[DCTSIZE * 0] = (int)(even0 + odd7);
+      wsptr[DCTSIZE * 1] = (int)(even1 + odd6);
+      wsptr[DCTSIZE * 2] = (int)(even2 + odd5);
+      wsptr[DCTSIZE * 3] = (int)(even3 - odd4);
+      wsptr[DCTSIZE * 4] = (int)(even3 + odd4);
+      wsptr[DCTSIZE * 5] = (int)(even2 - odd5);
+      wsptr[DCTSIZE * 6] = (int)(even1 - odd6);
+      wsptr[DCTSIZE * 7] = (int)(even0 - odd7);
+    } else {
+      /* Every AC term of this column is zero, so each output of the 1-D
+       * transform equals the dequantized DC coefficient.  Quantization makes
+       * this the common case for typical images.
+       */
+      int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+
+      wsptr[DCTSIZE * 0] = wsptr[DCTSIZE * 1] = wsptr[DCTSIZE * 2] =
+        wsptr[DCTSIZE * 3] = wsptr[DCTSIZE * 4] = wsptr[DCTSIZE * 5] =
+        wsptr[DCTSIZE * 6] = wsptr[DCTSIZE * 7] = dcval;
+    }
+  }
+
+  /* Pass 2: rows of the workspace -> rows of the output array.
+   * Results are descaled by a factor of 8 == 2**3 and the PASS1_BITS
+   * scaling is removed at the same time.  wsptr steps one row per iteration.
+   */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < DCTSIZE; ctr++, wsptr += DCTSIZE) {
+    outptr = output_buf[ctr] + output_col;
+
+#ifndef NO_ZERO_ROW_TEST
+    /* A row whose AC terms are all zero yields eight copies of the descaled
+     * DC value.  Pass 1 has created many nonzero AC terms, so this applies
+     * less often than the column shortcut; define NO_ZERO_ROW_TEST to omit
+     * the test where it does not pay off.
+     */
+    if (!(wsptr[1] || wsptr[2] || wsptr[3] || wsptr[4] ||
+          wsptr[5] || wsptr[6] || wsptr[7])) {
+      JSAMPLE dcval =
+        range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
+
+      outptr[0] = outptr[1] = outptr[2] = outptr[3] =
+        outptr[4] = outptr[5] = outptr[6] = outptr[7] = dcval;
+      continue;
+    }
+#endif
+
+    /* Even part */
+
+    t10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]);
+    t11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]);
+
+    t13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]);
+    t12 =
+      MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - t13;
+
+    even0 = t10 + t13;
+    even1 = t11 + t12;
+    even2 = t11 - t12;
+    even3 = t10 - t13;
+
+    /* Odd part */
+
+    sum53 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3];
+    diff53 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3];
+    sum17 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7];
+    diff17 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7];
+
+    odd7 = sum17 + sum53;               /* phase 5 */
+    t11 = MULTIPLY(sum17 - sum53, FIX_1_414213562); /* 2*c4 */
+
+    rot = MULTIPLY(diff53 + diff17, FIX_1_847759065); /* 2*c2 */
+    t10 = MULTIPLY(diff17, FIX_1_082392200) - rot;    /* 2*(c2-c6) */
+    t12 = MULTIPLY(diff53, -FIX_2_613125930) + rot;   /* -2*(c2+c6) */
+
+    odd6 = t12 - odd7;                  /* phase 2 */
+    odd5 = t11 - odd6;
+    odd4 = t10 + odd5;
+
+    /* Final output stage: scale down by a factor of 8 and range-limit */
+
+    outptr[0] =
+      range_limit[IDESCALE(even0 + odd7, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[1] =
+      range_limit[IDESCALE(even1 + odd6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[2] =
+      range_limit[IDESCALE(even2 + odd5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[3] =
+      range_limit[IDESCALE(even3 - odd4, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[4] =
+      range_limit[IDESCALE(even3 + odd4, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[5] =
+      range_limit[IDESCALE(even2 - odd5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[6] =
+      range_limit[IDESCALE(even1 - odd6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[7] =
+      range_limit[IDESCALE(even0 - odd7, PASS1_BITS + 3) & RANGE_MASK];
+  }
+}
+
+#endif /* DCT_IFAST_SUPPORTED */
diff --git a/media/libjpeg/jidctint.c b/media/libjpeg/jidctint.c
new file mode 100644
index 0000000000..bb08748019
--- /dev/null
+++ b/media/libjpeg/jidctint.c
@@ -0,0 +1,2627 @@
+/*
+ * jidctint.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modification developed 2002-2018 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
+ * must also perform dequantization of the input coefficients.
+ *
+ * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+ * on each row (or vice versa, but it's more convenient to emit a row at
+ * a time). Direct algorithms are also available, but they are much more
+ * complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on an algorithm described in
+ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+ * The primary algorithm described there uses 11 multiplies and 29 adds.
+ * We use their alternate method with 12 multiplies and 32 adds.
+ * The advantage of this method is that no data path contains more than one
+ * multiplication; this allows a very simple and accurate implementation in
+ * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide IDCT routines with various output sample block sizes for
+ * direct resolution reduction or enlargement without additional resampling:
+ * NxN (N=1...16) pixels for one 8x8 input DCT block.
+ *
+ * For N<8 we simply take the corresponding low-frequency coefficients of
+ * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
+ * to yield the downscaled outputs.
+ * This can be seen as direct low-pass downsampling from the DCT domain
+ * point of view rather than the usual spatial domain point of view,
+ * yielding significant computational savings and results at least
+ * as good as common bilinear (averaging) spatial downsampling.
+ *
+ * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients as
+ * the lower frequencies and assuming the higher frequencies to be zero.
+ * It turns out that the computational effort, relative to the output size,
+ * is similar to that of the 8x8 IDCT.
+ * Furthermore, the scaling and descaling is the same for all IDCT sizes.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef DCT_ISLOW_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
+#endif
+
+
+/*
+ * The poop on this scaling stuff is as follows:
+ *
+ * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
+ * larger than the true IDCT outputs. The final outputs are therefore
+ * a factor of N larger than desired; since N=8 this can be cured by
+ * a simple right shift at the end of the algorithm. The advantage of
+ * this arrangement is that we save two multiplications per 1-D IDCT,
+ * because the y0 and y4 inputs need not be divided by sqrt(N).
+ *
+ * We have to do addition and subtraction of the integer inputs, which
+ * is no problem, and multiplication by fractional constants, which is
+ * a problem to do in integer arithmetic. We multiply all the constants
+ * by CONST_SCALE and convert them to integer constants (thus retaining
+ * CONST_BITS bits of precision in the constants). After doing a
+ * multiplication we have to divide the product by CONST_SCALE, with proper
+ * rounding, to produce the correct output. This division can be done
+ * cheaply as a right shift of CONST_BITS bits. We postpone shifting
+ * as long as possible so that partial sums can be added together with
+ * full fractional precision.
+ *
+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
+ * they are represented to better-than-integral precision. These outputs
+ * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
+ * with the recommended scaling. (To scale up 12-bit sample data further, an
+ * intermediate JLONG array would be needed.)
+ *
+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
+ * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
+ * shows that the values given below are the most effective.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS 13 /* fractional bits kept in the FIX_* multiplier constants */
+#define PASS1_BITS 2 /* extra scaling bits applied to the pass-1 outputs */
+#else
+#define CONST_BITS 13 /* same constant precision for the wider-sample build */
+#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 13 /* pre-computed: each value is the constant times 2^13, rounded */
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
+#else /* any other CONST_BITS: rely on FIX(), which is defined outside this file */
+#define FIX_0_298631336 FIX(0.298631336)
+#define FIX_0_390180644 FIX(0.390180644)
+#define FIX_0_541196100 FIX(0.541196100)
+#define FIX_0_765366865 FIX(0.765366865)
+#define FIX_0_899976223 FIX(0.899976223)
+#define FIX_1_175875602 FIX(1.175875602)
+#define FIX_1_501321110 FIX(1.501321110)
+#define FIX_1_847759065 FIX(1.847759065)
+#define FIX_1_961570560 FIX(1.961570560)
+#define FIX_2_053119869 FIX(2.053119869)
+#define FIX_2_562915447 FIX(2.562915447)
+#define FIX_3_072711026 FIX(3.072711026)
+#endif /* CONST_BITS == 13 */
+
+
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
+ * For 8-bit samples with the recommended scaling, all the variable
+ * and constant values involved are no more than 16 bits wide, so a
+ * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+ * For 12-bit samples, a full 32-bit multiplication will be needed.
+ */
+
+#if BITS_IN_JSAMPLE == 8 /* use MULTIPLY16C16, which is defined outside this file */
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
+#else /* wider samples: ordinary full-width multiply */
+#define MULTIPLY(var, const) ((var) * (const))
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce an int result. In this module, both inputs and result
+ * are 16 bits or less, so either int or short multiply will work.
+ * The macro only multiplies; any scaling of the product is applied by the
+ * code that uses it.
+ */
+
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
+
+
+/*
+ * Dequantize one block of coefficients and apply the accurate integer
+ * inverse DCT to it.
+ *
+ * cinfo       decompressor object; used here only to obtain the range-limit
+ *             table through IDCT_range_limit().
+ * compptr     component descriptor; its dct_table member is read as an
+ *             array of ISLOW_MULT_TYPE multipliers.
+ * coef_block  the DCTSIZE x DCTSIZE block of input coefficients.
+ * output_buf  array of output row pointers; rows 0..DCTSIZE-1 are written.
+ * output_col  column offset added to every output row pointer.
+ *
+ * Pass 1 transforms the columns into an int workspace whose values are
+ * scaled up by 2**PASS1_BITS.  Pass 2 transforms the workspace rows,
+ * descales by CONST_BITS + PASS1_BITS + 3 bits and stores the range-limited
+ * samples.
+ */
+
+GLOBAL(void)
+jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  JLONG acc0, acc1, acc2, acc3;         /* even temporaries, then odd terms */
+  JLONG ev0, ev1, ev2, ev3;             /* results of the even part */
+  JLONG q1, q2, q3, q4, q5;             /* inputs, sums and products */
+  int workspace[DCTSIZE2];              /* buffers data between passes */
+  int *wsptr = workspace;
+  JCOEFPTR inptr = coef_block;
+  ISLOW_MULT_TYPE *quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPROW outptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: columns of the input block -> columns of the workspace.
+   * Results are scaled up by sqrt(8) compared to a true IDCT and further
+   * by 2**PASS1_BITS.  All three pointers step one column per iteration.
+   */
+
+  for (ctr = DCTSIZE; ctr > 0; ctr--, inptr++, quantptr++, wsptr++) {
+    if (inptr[DCTSIZE * 1] != 0 || inptr[DCTSIZE * 2] != 0 ||
+        inptr[DCTSIZE * 3] != 0 || inptr[DCTSIZE * 4] != 0 ||
+        inptr[DCTSIZE * 5] != 0 || inptr[DCTSIZE * 6] != 0 ||
+        inptr[DCTSIZE * 7] != 0) {
+      /* At least one AC term is nonzero: do the full 1-D transform. */
+
+      /* Even part: reverse the even part of the forward DCT. */
+      /* The rotator is sqrt(2)*c(-6). */
+
+      q2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+      q3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+      q1 = MULTIPLY(q2 + q3, FIX_0_541196100);
+      acc2 = q1 + MULTIPLY(q3, -FIX_1_847759065);
+      acc3 = q1 + MULTIPLY(q2, FIX_0_765366865);
+
+      q2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+      q3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+
+      acc0 = LEFT_SHIFT(q2 + q3, CONST_BITS);
+      acc1 = LEFT_SHIFT(q2 - q3, CONST_BITS);
+
+      ev0 = acc0 + acc3;
+      ev1 = acc1 + acc2;
+      ev2 = acc1 - acc2;
+      ev3 = acc0 - acc3;
+
+      /* Odd part per figure 8; the matrix is unitary and hence its
+       * transpose is its inverse.  acc0..acc3 are y7,y5,y3,y1 respectively.
+       */
+
+      acc0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+      acc1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+      acc2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+      acc3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+
+      q1 = acc0 + acc3;
+      q2 = acc1 + acc2;
+      q3 = acc0 + acc2;
+      q4 = acc1 + acc3;
+      q5 = MULTIPLY(q3 + q4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+      acc0 = MULTIPLY(acc0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+      acc1 = MULTIPLY(acc1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+      acc2 = MULTIPLY(acc2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+      acc3 = MULTIPLY(acc3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+      q1 = MULTIPLY(q1, -FIX_0_899976223);    /* sqrt(2) * ( c7-c3) */
+      q2 = MULTIPLY(q2, -FIX_2_562915447);    /* sqrt(2) * (-c1-c3) */
+      q3 = MULTIPLY(q3, -FIX_1_961570560);    /* sqrt(2) * (-c3-c5) */
+      q4 = MULTIPLY(q4, -FIX_0_390180644);    /* sqrt(2) * ( c5-c3) */
+
+      q3 = q3 + q5;
+      q4 = q4 + q5;
+
+      acc0 = acc0 + (q1 + q3);
+      acc1 = acc1 + (q2 + q4);
+      acc2 = acc2 + (q2 + q3);
+      acc3 = acc3 + (q1 + q4);
+
+      /* Final output stage: combine ev0..ev3 with acc0..acc3 and store the
+       * column in ascending row order.
+       */
+
+      wsptr[DCTSIZE * 0] = (int)DESCALE(ev0 + acc3, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 1] = (int)DESCALE(ev1 + acc2, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 2] = (int)DESCALE(ev2 + acc1, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 3] = (int)DESCALE(ev3 + acc0, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 4] = (int)DESCALE(ev3 - acc0, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 5] = (int)DESCALE(ev2 - acc1, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 6] = (int)DESCALE(ev1 - acc2, CONST_BITS - PASS1_BITS);
+      wsptr[DCTSIZE * 7] = (int)DESCALE(ev0 - acc3, CONST_BITS - PASS1_BITS);
+    } else {
+      /* Every AC term of this column is zero, so each output of the 1-D
+       * transform equals the dequantized DC coefficient scaled up by
+       * PASS1_BITS.  Quantization makes this the common case for typical
+       * images.
+       */
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                                        quantptr[DCTSIZE * 0]), PASS1_BITS);
+
+      wsptr[DCTSIZE * 0] = wsptr[DCTSIZE * 1] = wsptr[DCTSIZE * 2] =
+        wsptr[DCTSIZE * 3] = wsptr[DCTSIZE * 4] = wsptr[DCTSIZE * 5] =
+        wsptr[DCTSIZE * 6] = wsptr[DCTSIZE * 7] = dcval;
+    }
+  }
+
+  /* Pass 2: rows of the workspace -> rows of the output array.
+   * Results are descaled by a factor of 8 == 2**3 and the PASS1_BITS
+   * scaling is removed at the same time.  wsptr steps one row per iteration.
+   */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < DCTSIZE; ctr++, wsptr += DCTSIZE) {
+    outptr = output_buf[ctr] + output_col;
+
+#ifndef NO_ZERO_ROW_TEST
+    /* A row whose AC terms are all zero yields eight copies of the descaled
+     * DC value.  Pass 1 has created many nonzero AC terms, so this applies
+     * less often than the column shortcut; define NO_ZERO_ROW_TEST to omit
+     * the test where it does not pay off.
+     */
+    if (!(wsptr[1] || wsptr[2] || wsptr[3] || wsptr[4] ||
+          wsptr[5] || wsptr[6] || wsptr[7])) {
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
+
+      outptr[0] = outptr[1] = outptr[2] = outptr[3] =
+        outptr[4] = outptr[5] = outptr[6] = outptr[7] = dcval;
+      continue;
+    }
+#endif
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+
+    q2 = (JLONG)wsptr[2];
+    q3 = (JLONG)wsptr[6];
+
+    q1 = MULTIPLY(q2 + q3, FIX_0_541196100);
+    acc2 = q1 + MULTIPLY(q3, -FIX_1_847759065);
+    acc3 = q1 + MULTIPLY(q2, FIX_0_765366865);
+
+    acc0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
+    acc1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
+
+    ev0 = acc0 + acc3;
+    ev1 = acc1 + acc2;
+    ev2 = acc1 - acc2;
+    ev3 = acc0 - acc3;
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  acc0..acc3 are y7,y5,y3,y1 respectively.
+     */
+
+    acc0 = (JLONG)wsptr[7];
+    acc1 = (JLONG)wsptr[5];
+    acc2 = (JLONG)wsptr[3];
+    acc3 = (JLONG)wsptr[1];
+
+    q1 = acc0 + acc3;
+    q2 = acc1 + acc2;
+    q3 = acc0 + acc2;
+    q4 = acc1 + acc3;
+    q5 = MULTIPLY(q3 + q4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+    acc0 = MULTIPLY(acc0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    acc1 = MULTIPLY(acc1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    acc2 = MULTIPLY(acc2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    acc3 = MULTIPLY(acc3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    q1 = MULTIPLY(q1, -FIX_0_899976223);    /* sqrt(2) * ( c7-c3) */
+    q2 = MULTIPLY(q2, -FIX_2_562915447);    /* sqrt(2) * (-c1-c3) */
+    q3 = MULTIPLY(q3, -FIX_1_961570560);    /* sqrt(2) * (-c3-c5) */
+    q4 = MULTIPLY(q4, -FIX_0_390180644);    /* sqrt(2) * ( c5-c3) */
+
+    q3 = q3 + q5;
+    q4 = q4 + q5;
+
+    acc0 = acc0 + (q1 + q3);
+    acc1 = acc1 + (q2 + q4);
+    acc2 = acc2 + (q2 + q3);
+    acc3 = acc3 + (q1 + q4);
+
+    /* Final output stage: combine ev0..ev3 with acc0..acc3, descale and
+     * range-limit, storing the samples in ascending column order.
+     */
+
+    outptr[0] = range_limit[(int)DESCALE(ev0 + acc3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(ev1 + acc2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)DESCALE(ev2 + acc1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)DESCALE(ev3 + acc0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)DESCALE(ev3 - acc0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)DESCALE(ev2 - acc1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)DESCALE(ev1 - acc2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)DESCALE(ev0 - acc3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+  }
+}
+
+#ifdef IDCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 7x7 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/14).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+ JLONG z1, z2, z3;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[7 * 7]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
+ /* Add fudge factor here for final descale. */
+ tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp0 = z1 + z3;
+ z2 -= tmp0;
+ tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+ tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp0 += z2;
+ tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 7 rows from work array, store into output array. */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 7; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale. */
+ tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
+
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
+
+ tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
+ tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+ tmp0 = z1 + z3;
+ z2 -= tmp0;
+ tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+ tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
+ tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
+ tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
+
+ /* Odd part */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
+ tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
+ tmp0 = tmp1 - tmp2;
+ tmp1 += tmp2;
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
+ tmp1 += tmp2;
+ z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
+ tmp0 += z2;
+ tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
+
+ /* Final output stage */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 7; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x6 output block.
+ *
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/12).
+ *
+ * Only a 6x6 corner of the coefficient block is consumed: pass 1 visits
+ * six consecutive columns and, in each, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 5.  Pass 2 writes six samples to each of
+ * output_buf[0] .. output_buf[5], starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+ JLONG z1, z2, z3;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[6 * 6]; /* buffers data between passes: 6 rows of 6 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields six values, written down the matching workspace
+ * column (stride 6).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the descaling shifts below, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS); /* descaled here; stored below without a further shift */
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
+ tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
+ tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS); /* no MULTIPLY: scaled by PASS1_BITS only, matching tmp11 */
+
+ /* Final output stage */
+
+ wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 1] = (int)(tmp11 + tmp1); /* both terms already at PASS1_BITS scale */
+ wsptr[6 * 4] = (int)(tmp11 - tmp1); /* both terms already at PASS1_BITS scale */
+ wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 6 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..5] and writes outptr[0..5].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 6; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ tmp2 = (JLONG)wsptr[4];
+ tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
+ tmp1 = tmp0 + tmp10;
+ tmp11 = tmp0 - tmp10 - tmp10; /* unlike pass 1, not descaled early */
+ tmp10 = (JLONG)wsptr[2];
+ tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
+ tmp10 = tmp1 + tmp0;
+ tmp12 = tmp1 - tmp0;
+
+ /* Odd part */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+ tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
+ tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
+ tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS); /* no MULTIPLY: brought to CONST_BITS scale by the shift */
+
+ /* Final output stage: descale each sum, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 6; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 5x5 output block.
+ *
+ * Optimized algorithm with 5 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/10).
+ *
+ * Only a 5x5 corner of the coefficient block is consumed: pass 1 visits
+ * five consecutive columns and, in each, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 4.  Pass 2 writes five samples to each of
+ * output_buf[0] .. output_buf[4], starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
+ JLONG z1, z2, z3;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[5 * 5]; /* buffers data between passes: 5 rows of 5 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields five values, written down the matching workspace
+ * column (stride 5).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the output-stage shifts, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= LEFT_SHIFT(z2, 2); /* subtracts 4 * z2 for the middle output */
+
+ /* Odd part */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage: rows k and 4-k are the sum and difference of the
+ * same even/odd pair; row 2 has no odd contribution.
+ */
+
+ wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 5 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..4] and writes outptr[0..4].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 5; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
+ tmp0 = (JLONG)wsptr[2];
+ tmp1 = (JLONG)wsptr[4];
+ z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+ z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+ z3 = tmp12 + z2;
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z1;
+ tmp12 -= LEFT_SHIFT(z2, 2); /* subtracts 4 * z2 for the middle output */
+
+ /* Odd part */
+
+ z2 = (JLONG)wsptr[1];
+ z3 = (JLONG)wsptr[3];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
+ tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
+ tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
+
+ /* Final output stage: descale each sum, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 5; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x3 output block.
+ *
+ * Optimized algorithm with 2 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/6).
+ *
+ * Only a 3x3 corner of the coefficient block is consumed: pass 1 visits
+ * three consecutive columns and, in each, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 2.  Pass 2 writes three samples to each of
+ * output_buf[0] .. output_buf[2], starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp2, tmp10, tmp12;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[3 * 3]; /* buffers data between passes: 3 rows of 3 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields three values, written down the matching workspace
+ * column (stride 3).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the output-stage shifts, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12; /* middle output: DC term minus twice the c2 product */
+
+ /* Odd part */
+
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage: rows 0 and 2 are the sum and difference of the
+ * even and odd parts; row 1 has no odd contribution.
+ */
+
+ wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 3 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..2] and writes outptr[0..2].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 3; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ tmp2 = (JLONG)wsptr[2];
+ tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+ tmp10 = tmp0 + tmp12;
+ tmp2 = tmp0 - tmp12 - tmp12; /* middle output: DC term minus twice the c2 product */
+
+ /* Odd part */
+
+ tmp12 = (JLONG)wsptr[1];
+ tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+ /* Final output stage: descale each value, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 3; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 9x9 output block.
+ *
+ * Optimized algorithm with 10 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/18).
+ *
+ * Pass 1 visits eight consecutive input columns, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 7 in each, and produces nine values per
+ * column.  Pass 2 turns each of the nine resulting rows (eight values
+ * wide) into nine samples, written to output_buf[0] .. output_buf[8]
+ * starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[8 * 9]; /* buffers data between passes: 9 rows of 8 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields nine values, written down the matching workspace
+ * column (stride 8).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the output-stage shifts, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
+ tmp1 = tmp0 + tmp3;
+ tmp2 = tmp0 - tmp3 - tmp3;
+
+ tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+ tmp11 = tmp2 + tmp0;
+ tmp14 = tmp2 - tmp0 - tmp0;
+
+ tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+ tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
+ tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
+
+ tmp10 = tmp1 + tmp0 - tmp3;
+ tmp12 = tmp1 - tmp0 + tmp2;
+ tmp13 = tmp1 - tmp2 + tmp3;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3; z2 holds this product from here on */
+
+ tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
+ tmp0 = tmp2 + tmp3 - z2;
+ tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
+ tmp2 += z2 - tmp1;
+ tmp3 += z2 + tmp1;
+ tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+ /* Final output stage: rows k and 8-k are the sum and difference of the
+ * same even/odd pair; row 4 has no odd contribution.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 9 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..7] and writes outptr[0..8].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 9; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
+
+ tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
+ tmp1 = tmp0 + tmp3;
+ tmp2 = tmp0 - tmp3 - tmp3;
+
+ tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+ tmp11 = tmp2 + tmp0;
+ tmp14 = tmp2 - tmp0 - tmp0;
+
+ tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+ tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
+ tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
+
+ tmp10 = tmp1 + tmp0 - tmp3;
+ tmp12 = tmp1 - tmp0 + tmp2;
+ tmp13 = tmp1 - tmp2 + tmp3;
+
+ /* Odd part */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3; z2 holds this product from here on */
+
+ tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
+ tmp0 = tmp2 + tmp3 - z2;
+ tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
+ tmp2 += z2 - tmp1;
+ tmp3 += z2 + tmp1;
+ tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+ /* Final output stage: descale each sum, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x10 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/20).
+ *
+ * Pass 1 visits eight consecutive input columns, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 7 in each, and produces ten values per
+ * column.  Pass 2 turns each of the ten resulting rows (eight values
+ * wide) into ten samples, written to output_buf[0] .. output_buf[9]
+ * starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
+ JLONG z1, z2, z3, z4, z5;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[8 * 10]; /* buffers data between passes: 10 rows of 8 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields ten values, written down the matching workspace
+ * column (stride 8).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z3 = LEFT_SHIFT(z3, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the descaling shifts below, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
+ CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2; descaled here, stored below without a further shift */
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+ z5 = LEFT_SHIFT(z3, CONST_BITS); /* z3 at CONST_BITS scale; z3 itself stays unscaled for tmp12 below */
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z5 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1); /* last term is tmp13 / 2 at CONST_BITS scale */
+
+ tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS); /* no MULTIPLY: scaled by PASS1_BITS only, matching tmp22 */
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage: rows k and 9-k are the sum and difference of the
+ * same even/odd pair.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)(tmp22 + tmp12); /* both terms already at PASS1_BITS scale */
+ wsptr[8 * 7] = (int)(tmp22 - tmp12); /* both terms already at PASS1_BITS scale */
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 10 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..7] and writes outptr[0..9].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 10; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ z3 = LEFT_SHIFT(z3, CONST_BITS);
+ z4 = (JLONG)wsptr[4];
+ z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
+ z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
+ tmp10 = z3 + z1;
+ tmp11 = z3 - z2;
+
+ tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1); /* c0 = (c4-c8)*2 */
+
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[6];
+
+ z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
+ tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+ tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+ tmp20 = tmp10 + tmp12;
+ tmp24 = tmp10 - tmp12;
+ tmp21 = tmp11 + tmp13;
+ tmp23 = tmp11 - tmp13;
+
+ /* Odd part */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z3 = LEFT_SHIFT(z3, CONST_BITS); /* pre-scaled in place; pass 1 used a separate z5 for this */
+ z4 = (JLONG)wsptr[7];
+
+ tmp11 = z2 + z4;
+ tmp13 = z2 - z4;
+
+ tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
+ z4 = z3 + tmp12;
+
+ tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+ tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+ z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
+ z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1); /* last term is tmp13 / 2 at CONST_BITS scale */
+
+ tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3; /* z3 is subtracted outside the shift because it is already scaled */
+
+ tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+ tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+ /* Final output stage: descale each sum, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing an 11x11 output block.
+ *
+ * Optimized algorithm with 24 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/22).
+ *
+ * Pass 1 visits eight consecutive input columns, reads the entries at
+ * DCTSIZE * 0 .. DCTSIZE * 7 in each, and produces eleven values per
+ * column.  Pass 2 turns each of the eleven resulting rows (eight values
+ * wide) into eleven samples, written to output_buf[0] .. output_buf[10]
+ * starting at offset output_col.
+ *
+ * cinfo is used only to fetch the range-limit table; compptr supplies
+ * dct_table, which is indexed in parallel with coef_block.
+ */
+
+GLOBAL(void)
+jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr; /* current input column (pass 1) */
+ ISLOW_MULT_TYPE *quantptr; /* dct_table entry matching inptr */
+ int *wsptr; /* cursor into workspace */
+ JSAMPROW outptr; /* current output row (pass 2) */
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr; /* column counter in pass 1, row counter in pass 2 */
+ int workspace[8 * 11]; /* buffers data between passes: 11 rows of 8 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each column yields eleven values, written down the matching workspace
+ * column (stride 8).  They are left scaled up by PASS1_BITS.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: tmp10 carries the scaled DC term; tmp20..tmp25 receive
+ * the six even-part results.
+ */
+
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The constant is half of 1 << (CONST_BITS - PASS1_BITS), the divisor
+ * implied by the output-stage shifts, so those shifts round rather
+ * than truncate (assuming RIGHT_SHIFT rounds toward minus infinity;
+ * the macro is defined outside this function).
+ */
+ tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
+ tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
+ z4 = z1 + z3;
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
+ z4 -= z2;
+ tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
+ tmp21 = tmp20 + tmp23 + tmp25 -
+ MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
+ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+ tmp24 += tmp25;
+ tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
+ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
+ MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
+ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
+
+ /* Odd part: tmp10 is reused from here on, together with tmp11..tmp14,
+ * for the five odd-part results.
+ */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp11 = z1 + z2;
+ tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
+ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
+ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
+ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
+ tmp11 += z1;
+ tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
+ MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
+ MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
+
+ /* Final output stage: rows k and 10-k are the sum and difference of
+ * the same even/odd pair; row 5 has no odd contribution.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 11 rows from work array, store into output array.
+ * Each iteration reads wsptr[0..7] and writes outptr[0..10].
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 11; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+
+ /* Add fudge factor here for final descale.
+ * After the left shift on the next line this term becomes
+ * 1 << (CONST_BITS + PASS1_BITS + 2), i.e. half of the divisor implied
+ * by the output-stage shift of CONST_BITS + PASS1_BITS + 3.
+ */
+ tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
+
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
+
+ tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
+ tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
+ z4 = z1 + z3;
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
+ z4 -= z2;
+ tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
+ tmp21 = tmp20 + tmp23 + tmp25 -
+ MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
+ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+ tmp24 += tmp25;
+ tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
+ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
+ MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
+ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
+
+ /* Odd part: tmp10 is reused from here on, together with tmp11..tmp14,
+ * for the five odd-part results.
+ */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+
+ tmp11 = z1 + z2;
+ tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
+ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
+ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
+ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
+ tmp11 += z1;
+ tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
+ MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
+ MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
+
+ /* Final output stage: descale each sum, mask it with RANGE_MASK, and
+ * store the range_limit[] entry selected by that index.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x12 output block.
+ *
+ * Optimized algorithm with 15 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/24).
+ *
+ * Parameters, as used below:
+ *   cinfo       decompressor state; only used to obtain the range-limit table
+ *   compptr     component info; its dct_table is passed to DEQUANTIZE
+ *               together with each coefficient
+ *   coef_block  input coefficients; rows 0..7 of each of 8 columns are read
+ *   output_buf  array of output rows; rows 0..11 are written
+ *   output_col  offset of the first sample written within each output row
+ *
+ * The 1-D kernel appears twice (pass 1 on columns, pass 2 on rows) with the
+ * same butterfly arithmetic; only the input source, the rounding bias and
+ * the final shift differ.  tmp10..tmp15 and z1..z4 are reused for different
+ * quantities within a pass, so statement order inside the kernel matters.
+ */
+
+GLOBAL(void)
+jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8 * 12]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each iteration handles one of the 8 input columns (inptr, quantptr and
+ * wsptr all advance by one) and produces 12 workspace entries, one per
+ * output row, spaced 8 ints apart.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: input rows 0, 2, 4, 6 produce tmp20..tmp25. */
+
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z3 = LEFT_SHIFT(z3, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The bias is 2^(CONST_BITS - PASS1_BITS - 1), half of the divisor implied
+ * by the pass-1 right shift by CONST_BITS - PASS1_BITS, so that shift
+ * rounds instead of truncating.  It is folded into the DC term once
+ * rather than being added to each output separately.
+ */
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+ z2 = LEFT_SHIFT(z2, CONST_BITS);
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part: input rows 1, 3, 5, 7 produce tmp10..tmp15. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage.
+ * Each even/odd pair yields two mirrored workspace rows: the sum goes to
+ * row k and the difference to row 11 - k.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 12 rows from work array, store into output array.
+ * Each iteration reads the 8 values of one workspace row (wsptr[0..7]) and
+ * writes 12 samples to output_buf[ctr], starting at output_col.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 12; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: workspace columns 0, 2, 4, 6 produce tmp20..tmp25. */
+
+ /* Add fudge factor here for final descale.
+ * Taking LEFT_SHIFT as a plain left shift, the bias becomes
+ * 2^(CONST_BITS + PASS1_BITS + 2) after the shift on the next line, which
+ * is half of the divisor implied by the final right shift by
+ * CONST_BITS + PASS1_BITS + 3.
+ */
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ z3 = LEFT_SHIFT(z3, CONST_BITS);
+
+ z4 = (JLONG)wsptr[4];
+ z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ z1 = (JLONG)wsptr[2];
+ z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ z2 = (JLONG)wsptr[6];
+ z2 = LEFT_SHIFT(z2, CONST_BITS);
+
+ tmp12 = z1 - z2;
+
+ tmp21 = z3 + tmp12;
+ tmp24 = z3 - tmp12;
+
+ tmp12 = z4 + z2;
+
+ tmp20 = tmp10 + tmp12;
+ tmp25 = tmp10 - tmp12;
+
+ tmp12 = z4 - z1 - z2;
+
+ tmp22 = tmp11 + tmp12;
+ tmp23 = tmp11 - tmp12;
+
+ /* Odd part: workspace columns 1, 3, 5, 7 produce tmp10..tmp15. */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+
+ tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
+
+ tmp10 = z1 + z3;
+ tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
+ tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
+ tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
+ tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
+ MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
+
+ z1 -= z4;
+ z2 -= z3;
+ z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
+ tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
+ tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
+
+ /* Final output stage.
+ * The sum of each even/odd pair goes to column k and the difference to
+ * column 11 - k.  The descaled value is masked with RANGE_MASK and used as
+ * an index into range_limit; the table entry is the stored sample.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 13x13 output block.
+ *
+ * Optimized algorithm with 29 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/26).
+ *
+ * Parameters, as used below:
+ *   cinfo       decompressor state; only used to obtain the range-limit table
+ *   compptr     component info; its dct_table is passed to DEQUANTIZE
+ *               together with each coefficient
+ *   coef_block  input coefficients; rows 0..7 of each of 8 columns are read
+ *   output_buf  array of output rows; rows 0..12 are written
+ *   output_col  offset of the first sample written within each output row
+ *
+ * The 1-D kernel appears twice (pass 1 on columns, pass 2 on rows) with the
+ * same butterfly arithmetic; only the input source, the rounding bias and
+ * the final shift differ.  Because the output length is odd, the middle
+ * output (index 6) comes from the even part alone.  tmp10..tmp15 and z1..z4
+ * are reused for different quantities within a pass, so statement order
+ * inside the kernel matters.
+ */
+
+GLOBAL(void)
+jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8 * 13]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each iteration handles one of the 8 input columns (inptr, quantptr and
+ * wsptr all advance by one) and produces 13 workspace entries, one per
+ * output row, spaced 8 ints apart.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: input rows 0, 2, 4, 6 produce tmp20..tmp26. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The bias is 2^(CONST_BITS - PASS1_BITS - 1), half of the divisor implied
+ * by the pass-1 right shift by CONST_BITS - PASS1_BITS, so that shift
+ * rounds instead of truncating.  It is folded into the DC term, which is
+ * the only place it is added.
+ */
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
+
+ tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
+ tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
+
+ tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
+
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+ tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
+
+ /* Odd part: input rows 1, 3, 5, 7 produce tmp10..tmp15. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
+ tmp15 = z1 + z4;
+ tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
+ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
+ tmp11 += tmp14;
+ tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
+ tmp12 += tmp14;
+ tmp13 += tmp14;
+ tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
+ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+ MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
+ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
+ tmp14 += z1;
+ tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
+ MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
+
+ /* Final output stage.
+ * Each even/odd pair yields two mirrored workspace rows: the sum goes to
+ * row k and the difference to row 12 - k.  Row 6, the middle one, takes
+ * tmp26 alone with no odd-part contribution.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 13 rows from work array, store into output array.
+ * Each iteration reads the 8 values of one workspace row (wsptr[0..7]) and
+ * writes 13 samples to output_buf[ctr], starting at output_col.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 13; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: workspace columns 0, 2, 4, 6 produce tmp20..tmp26. */
+
+ /* Add fudge factor here for final descale.
+ * Taking LEFT_SHIFT as a plain left shift, the bias becomes
+ * 2^(CONST_BITS + PASS1_BITS + 2) after the shift on the next line, which
+ * is half of the divisor implied by the final right shift by
+ * CONST_BITS + PASS1_BITS + 3.
+ */
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
+
+ tmp10 = z3 + z4;
+ tmp11 = z3 - z4;
+
+ tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
+
+ tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
+ tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
+
+ tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+ tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
+ tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
+
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+ tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
+
+ /* Odd part: workspace columns 1, 3, 5, 7 produce tmp10..tmp15. */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
+ tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
+ tmp15 = z1 + z4;
+ tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
+ tmp10 = tmp11 + tmp12 + tmp13 -
+ MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
+ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
+ tmp11 += tmp14;
+ tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
+ tmp12 += tmp14;
+ tmp13 += tmp14;
+ tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
+ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+ MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
+ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
+ tmp14 += z1;
+ tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
+ MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
+
+ /* Final output stage.
+ * The sum of each even/odd pair goes to column k and the difference to
+ * column 12 - k; column 6 takes tmp26 alone.  The descaled value is masked
+ * with RANGE_MASK and used as an index into range_limit; the table entry
+ * is the stored sample.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x14 output block.
+ *
+ * Optimized algorithm with 20 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/28).
+ *
+ * Parameters, as used below:
+ *   cinfo       decompressor state; only used to obtain the range-limit table
+ *   compptr     component info; its dct_table is passed to DEQUANTIZE
+ *               together with each coefficient
+ *   coef_block  input coefficients; rows 0..7 of each of 8 columns are read
+ *   output_buf  array of output rows; rows 0..13 are written
+ *   output_col  offset of the first sample written within each output row
+ *
+ * The 1-D kernel appears twice (pass 1 on columns, pass 2 on rows).  The two
+ * copies differ in how the tmp23/tmp13 pair (outputs 3 and 10) is scaled:
+ * pass 1 descales it early and stores it without a further shift, while
+ * pass 2 keeps it at full scale and descales it with the other outputs.
+ * tmp10..tmp16 and z1..z4 are reused for different quantities within a
+ * pass, so statement order inside the kernel matters.
+ */
+
+GLOBAL(void)
+jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8 * 14]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each iteration handles one of the 8 input columns (inptr, quantptr and
+ * wsptr all advance by one) and produces 14 workspace entries, one per
+ * output row, spaced 8 ints apart.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: input rows 0, 2, 4, 6 produce tmp20..tmp26. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The bias is 2^(CONST_BITS - PASS1_BITS - 1), half of the divisor implied
+ * by the pass-1 right shift by CONST_BITS - PASS1_BITS, so that shift
+ * rounds instead of truncating.  It is folded into the DC term, which is
+ * the only place it is added.
+ */
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
+ CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part: input rows 1, 3, 5, 7 produce tmp10..tmp16. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp13 = LEFT_SHIFT(z4, CONST_BITS);
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
+ tmp16 += tmp15;
+ z1 += z4;
+ z4 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13; /* -c13 */
+ tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+ tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS);
+
+ /* Final output stage.
+ * Each even/odd pair yields two mirrored workspace rows: the sum goes to
+ * row k and the difference to row 13 - k.  Rows 3 and 10 are stored
+ * without a shift because tmp23 was already right-shifted above and tmp13
+ * was built with a left shift of only PASS1_BITS.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)(tmp23 + tmp13);
+ wsptr[8 * 10] = (int)(tmp23 - tmp13);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 14 rows from work array, store into output array.
+ * Each iteration reads the 8 values of one workspace row (wsptr[0..7]) and
+ * writes 14 samples to output_buf[ctr], starting at output_col.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 14; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: workspace columns 0, 2, 4, 6 produce tmp20..tmp26. */
+
+ /* Add fudge factor here for final descale.
+ * Taking LEFT_SHIFT as a plain left shift, the bias becomes
+ * 2^(CONST_BITS + PASS1_BITS + 2) after the shift on the next line, which
+ * is half of the divisor implied by the final right shift by
+ * CONST_BITS + PASS1_BITS + 3.
+ */
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ z4 = (JLONG)wsptr[4];
+ z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
+ z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
+ z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
+
+ tmp10 = z1 + z2;
+ tmp11 = z1 + z3;
+ tmp12 = z1 - z4;
+
+ tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1); /* c0 = (c4+c12-c8)*2 */
+
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
+
+ z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
+
+ tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
+ MULTIPLY(z2, FIX(1.378756276)); /* c2 */
+
+ tmp20 = tmp10 + tmp13;
+ tmp26 = tmp10 - tmp13;
+ tmp21 = tmp11 + tmp14;
+ tmp25 = tmp11 - tmp14;
+ tmp22 = tmp12 + tmp15;
+ tmp24 = tmp12 - tmp15;
+
+ /* Odd part: workspace columns 1, 3, 5, 7 produce tmp10..tmp16.
+ * Unlike pass 1, z4 itself is overwritten with its CONST_BITS-shifted
+ * value and used wherever pass 1 used tmp13 for that quantity.
+ */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+ z4 = LEFT_SHIFT(z4, CONST_BITS);
+
+ tmp14 = z1 + z3;
+ tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
+ tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
+ tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+ tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
+ tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
+ z1 -= z2;
+ tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
+ tmp16 += tmp15;
+ tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4; /* -c13 */
+ tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
+ tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
+ tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
+ tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+ tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
+
+ tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4;
+
+ /* Final output stage.
+ * The sum of each even/odd pair goes to column k and the difference to
+ * column 13 - k.  Here tmp23 and tmp13 are kept at full scale, so columns
+ * 3 and 10 are descaled like all the others.  The descaled value is masked
+ * with RANGE_MASK and used as an index into range_limit; the table entry
+ * is the stored sample.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 15x15 output block.
+ *
+ * Optimized algorithm with 22 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/30).
+ *
+ * Parameters, as used below:
+ *   cinfo       decompressor state; only used to obtain the range-limit table
+ *   compptr     component info; its dct_table is passed to DEQUANTIZE
+ *               together with each coefficient
+ *   coef_block  input coefficients; rows 0..7 of each of 8 columns are read
+ *   output_buf  array of output rows; rows 0..14 are written
+ *   output_col  offset of the first sample written within each output row
+ *
+ * The 1-D kernel appears twice (pass 1 on columns, pass 2 on rows) with the
+ * same butterfly arithmetic; only the input source, the rounding bias and
+ * the final shift differ.  Because the output length is odd, the middle
+ * output (index 7) comes from the even part alone.  tmp10..tmp16 and z1..z4
+ * are reused for different quantities within a pass, so statement order
+ * inside the kernel matters.
+ */
+
+GLOBAL(void)
+jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8 * 15]; /* buffers data between passes */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each iteration handles one of the 8 input columns (inptr, quantptr and
+ * wsptr all advance by one) and produces 15 workspace entries, one per
+ * output row, spaced 8 ints apart.
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: input rows 0, 2, 4, 6 produce tmp20..tmp27. */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * The bias is 2^(CONST_BITS - PASS1_BITS - 1), half of the divisor implied
+ * by the pass-1 right shift by CONST_BITS - PASS1_BITS, so that shift
+ * rounds instead of truncating.  It is folded into the DC term, which is
+ * the only place it is added.
+ */
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+ tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+ tmp12 = z1 - tmp10;
+ tmp13 = z1 + tmp11;
+ z1 -= LEFT_SHIFT(tmp11 - tmp10, 1); /* c0 = (c6-c12)*2 */
+
+ z4 = z2 - z3;
+ z3 += z2;
+ tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+ z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
+
+ tmp20 = tmp13 + tmp10 + tmp11;
+ tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+ tmp25 = tmp13 - tmp10 - tmp11;
+ tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+ tmp21 = tmp12 + tmp10 + tmp11;
+ tmp24 = tmp13 - tmp10 + tmp11;
+ tmp11 += tmp11;
+ tmp22 = z1 + tmp11; /* c10 = c6-c12 */
+ tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
+
+ /* Odd part: input rows 1, 3, 5, 7 produce tmp10..tmp16.
+ * z4 briefly holds row 5 so that z3 can take its c5 multiple, and is then
+ * reloaded with row 7.
+ */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp13 = z2 - z4;
+ tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
+ tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
+ tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
+
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
+ z2 = z1 - z4;
+ tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
+
+ tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+ tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+ tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
+ z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
+ tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
+ tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
+
+ /* Final output stage.
+ * Each even/odd pair yields two mirrored workspace rows: the sum goes to
+ * row k and the difference to row 14 - k.  Row 7, the middle one, takes
+ * tmp27 alone with no odd-part contribution.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 15 rows from work array, store into output array.
+ * Each iteration reads the 8 values of one workspace row (wsptr[0..7]) and
+ * writes 15 samples to output_buf[ctr], starting at output_col.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 15; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: workspace columns 0, 2, 4, 6 produce tmp20..tmp27. */
+
+ /* Add fudge factor here for final descale.
+ * Taking LEFT_SHIFT as a plain left shift, the bias becomes
+ * 2^(CONST_BITS + PASS1_BITS + 2) after the shift on the next line, which
+ * is half of the divisor implied by the final right shift by
+ * CONST_BITS + PASS1_BITS + 3.
+ */
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ z1 = LEFT_SHIFT(z1, CONST_BITS);
+
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
+
+ tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+ tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+ tmp12 = z1 - tmp10;
+ tmp13 = z1 + tmp11;
+ z1 -= LEFT_SHIFT(tmp11 - tmp10, 1); /* c0 = (c6-c12)*2 */
+
+ z4 = z2 - z3;
+ z3 += z2;
+ tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+ z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
+
+ tmp20 = tmp13 + tmp10 + tmp11;
+ tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+ tmp25 = tmp13 - tmp10 - tmp11;
+ tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+ tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+ tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+ tmp21 = tmp12 + tmp10 + tmp11;
+ tmp24 = tmp13 - tmp10 + tmp11;
+ tmp11 += tmp11;
+ tmp22 = z1 + tmp11; /* c10 = c6-c12 */
+ tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
+
+ /* Odd part: workspace columns 1, 3, 5, 7 produce tmp10..tmp16.
+ * z4 briefly holds column 5 so that z3 can take its c5 multiple, and is
+ * then reloaded with column 7.
+ */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[5];
+ z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
+ z4 = (JLONG)wsptr[7];
+
+ tmp13 = z2 - z4;
+ tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
+ tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
+ tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
+
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
+ z2 = z1 - z4;
+ tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
+
+ tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+ tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+ tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
+ z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
+ tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
+ tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
+
+ /* Final output stage.
+ * The sum of each even/odd pair goes to column k and the difference to
+ * column 14 - k; column 7 takes tmp27 alone.  The descaled value is masked
+ * with RANGE_MASK and used as an index into range_limit; the table entry
+ * is the stored sample.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x16 output block.
+ *
+ * Optimized algorithm with 28 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/32).
+ *
+ * Parameters, as used by this routine:
+ * cinfo - only passed to IDCT_range_limit() to get the clamp table
+ * compptr - compptr->dct_table supplies the dequantization multipliers
+ * coef_block - input coefficients; 8 columns, each read at rows 0..7
+ * (stride DCTSIZE)
+ * output_buf - 16 output rows are written, output_buf[0] .. output_buf[15]
+ * output_col - column offset added to every output row pointer
+ *
+ * Pass 1 turns each of the 8 input columns into 16 workspace values; pass 2
+ * turns each of the 16 workspace rows (8 values wide) into 16 samples.
+ */
+
+GLOBAL(void)
+jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+ JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[8 * 16]; /* buffers data between passes: 16 rows of 8 ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * Each iteration handles one column: inptr, quantptr and wsptr all step by
+ * one element, and the 16 results are stored 8 ints apart (wsptr[8 * k]).
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+ /* Even part: uses input rows 0, 2, 4 and 6 and yields tmp20..tmp27, the
+ * terms shared by output k and its mirror 15 - k.
+ */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+ /* Add fudge factor here for final descale.
+ * ONE << (CONST_BITS - PASS1_BITS - 1) is half of the pass-1 divisor
+ * 2^(CONST_BITS - PASS1_BITS); tmp0 feeds every output through
+ * tmp10..tmp13, so the bias reaches all 16 results.
+ */
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part: uses input rows 1, 3, 5 and 7 and yields tmp0..tmp3 and
+ * tmp10..tmp13, which are added for outputs 0..7 and subtracted for 15..8.
+ */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+
+ tmp11 = z1 + z3; /* scratch sum; tmp11 gets its final value below */
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4; /* z2 is now z2 + z4; the plain z2 is not needed again */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage: shift right by CONST_BITS - PASS1_BITS and store;
+ * workspace rows k and 15 - k take the sum and the difference.
+ */
+
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
+ }
+
+ /* Pass 2: process 16 rows from work array, store into output array.
+ * Each row is 8 workspace ints (wsptr[0..7]) and produces 16 samples in
+ * output_buf[ctr], starting at column output_col.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 16; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part: same arithmetic as pass 1, fed from wsptr[0], [2], [4], [6] */
+
+ /* Add fudge factor here for final descale.
+ * (ONE << (PASS1_BITS + 2)), once shifted left by CONST_BITS on the next
+ * line, is half of the final divisor 2^(CONST_BITS + PASS1_BITS + 3).
+ */
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+
+ z1 = (JLONG)wsptr[4];
+ tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
+ tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
+
+ tmp10 = tmp0 + tmp1;
+ tmp11 = tmp0 - tmp1;
+ tmp12 = tmp0 + tmp2;
+ tmp13 = tmp0 - tmp2;
+
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
+ z3 = z1 - z2;
+ z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
+ z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
+
+ tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
+ tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
+ tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+ tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+ tmp20 = tmp10 + tmp0;
+ tmp27 = tmp10 - tmp0;
+ tmp21 = tmp12 + tmp1;
+ tmp26 = tmp12 - tmp1;
+ tmp22 = tmp13 + tmp2;
+ tmp25 = tmp13 - tmp2;
+ tmp23 = tmp11 + tmp3;
+ tmp24 = tmp11 - tmp3;
+
+ /* Odd part: same arithmetic as pass 1, fed from wsptr[1], [3], [5], [7] */
+
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
+
+ tmp11 = z1 + z3; /* scratch sum; tmp11 gets its final value below */
+
+ tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
+ tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
+ tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
+ tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
+ tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
+ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
+ tmp0 = tmp1 + tmp2 + tmp3 -
+ MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
+ tmp13 = tmp10 + tmp11 + tmp12 -
+ MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
+ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
+ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
+ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
+ z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
+ tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
+ tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
+ z2 += z4; /* z2 is now z2 + z4; the plain z2 is not needed again */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
+ tmp1 += z1;
+ tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
+ tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
+ tmp12 += z2;
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
+ tmp2 += z2;
+ tmp3 += z2;
+ z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
+ tmp10 += z2;
+ tmp11 += z2;
+
+ /* Final output stage: shift right by CONST_BITS + PASS1_BITS + 3, mask the
+ * result with RANGE_MASK and use it as an index into range_limit; samples
+ * k and 15 - k take the sum and the difference.
+ */
+
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+
+ wsptr += 8; /* advance pointer to next row */
+ }
+}
+
+#endif /* IDCT_SCALING_SUPPORTED */
+#endif /* DCT_ISLOW_SUPPORTED */
diff --git a/media/libjpeg/jidctred.c b/media/libjpeg/jidctred.c
new file mode 100644
index 0000000000..1dd65a94d9
--- /dev/null
+++ b/media/libjpeg/jidctred.c
@@ -0,0 +1,409 @@
+/*
+ * jidctred.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains inverse-DCT routines that produce reduced-size output:
+ * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
+ *
+ * The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M)
+ * algorithm used in jidctint.c. We simply replace each 8-to-8 1-D IDCT step
+ * with an 8-to-4 step that produces the four averages of two adjacent outputs
+ * (or an 8-to-2 step producing two averages of four outputs, for 2x2 output).
+ * These steps were derived by computing the corresponding values at the end
+ * of the normal LL&M code, then simplifying as much as possible.
+ *
+ * 1x1 is trivial: just take the DC coefficient divided by 8.
+ *
+ * See jidctint.c for additional comments.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h" /* Private declarations for DCT subsystem */
+
+#ifdef IDCT_SCALING_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Scaling is the same as in jidctint.c.
+ * CONST_BITS is the number of fraction bits in the fixed-point FIX_*
+ * constants defined below; PASS1_BITS is the extra scale that the pass-1
+ * results in this file keep when they are stored in the workspace.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#else
+#define CONST_BITS 13
+#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
+#endif
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ *
+ * In the CONST_BITS == 13 branch each literal is the named constant times
+ * 2^13 (8192), rounded to the nearest integer; for example
+ * 0.765366865 * 8192 = 6269.9, stored as 6270.  For any other CONST_BITS
+ * the names fall back to the FIX() macro.
+ */
+
+#if CONST_BITS == 13
+#define FIX_0_211164243 ((JLONG)1730) /* FIX(0.211164243) */
+#define FIX_0_509795579 ((JLONG)4176) /* FIX(0.509795579) */
+#define FIX_0_601344887 ((JLONG)4926) /* FIX(0.601344887) */
+#define FIX_0_720959822 ((JLONG)5906) /* FIX(0.720959822) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_850430095 ((JLONG)6967) /* FIX(0.850430095) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_061594337 ((JLONG)8697) /* FIX(1.061594337) */
+#define FIX_1_272758580 ((JLONG)10426) /* FIX(1.272758580) */
+#define FIX_1_451774981 ((JLONG)11893) /* FIX(1.451774981) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_2_172734803 ((JLONG)17799) /* FIX(2.172734803) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_624509785 ((JLONG)29692) /* FIX(3.624509785) */
+#else
+#define FIX_0_211164243 FIX(0.211164243)
+#define FIX_0_509795579 FIX(0.509795579)
+#define FIX_0_601344887 FIX(0.601344887)
+#define FIX_0_720959822 FIX(0.720959822)
+#define FIX_0_765366865 FIX(0.765366865)
+#define FIX_0_850430095 FIX(0.850430095)
+#define FIX_0_899976223 FIX(0.899976223)
+#define FIX_1_061594337 FIX(1.061594337)
+#define FIX_1_272758580 FIX(1.272758580)
+#define FIX_1_451774981 FIX(1.451774981)
+#define FIX_1_847759065 FIX(1.847759065)
+#define FIX_2_172734803 FIX(2.172734803)
+#define FIX_2_562915447 FIX(2.562915447)
+#define FIX_3_624509785 FIX(3.624509785)
+#endif
+
+
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
+ * For 8-bit samples with the recommended scaling, all the variable
+ * and constant values involved are no more than 16 bits wide, so a
+ * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+ * For 12-bit samples, a full 32-bit multiplication will be needed.
+ *
+ * The 8-bit branch forwards to MULTIPLY16C16, which is not defined in this
+ * file; the other branch is a plain parenthesized product of its arguments.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
+#else
+#define MULTIPLY(var, const) ((var) * (const))
+#endif
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce an int result. In this module, both inputs and result
+ * are 16 bits or less, so either int or short multiply will work.
+ * The coefficient is cast to ISLOW_MULT_TYPE before the multiplication; the
+ * multiplier is used as passed.
+ */
+
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 4x4 output block.
+ *
+ * Parameters, as used by this routine:
+ * cinfo - only passed to IDCT_range_limit() to get the clamp table
+ * compptr - compptr->dct_table supplies the dequantization multipliers
+ * coef_block - input coefficients; row 4 of every column and all of
+ * column 4 are never read
+ * output_buf - 4 output rows are written, output_buf[0] .. output_buf[3]
+ * output_col - column offset added to every output row pointer
+ */
+
+GLOBAL(void)
+jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp2, tmp10, tmp12;
+ JLONG z1, z2, z3, z4;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[DCTSIZE * 4]; /* buffers data between passes: 4 rows of DCTSIZE ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * ctr counts down from DCTSIZE, so the column index is DCTSIZE - ctr.  The
+ * pointers are advanced in the for-statement itself, which keeps them in
+ * step when an iteration ends early with "continue".
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
+ /* Don't bother to process column 4, because second pass won't use it */
+ if (ctr == DCTSIZE - 4)
+ continue;
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 5] == 0 &&
+ inptr[DCTSIZE * 6] == 0 && inptr[DCTSIZE * 7] == 0) {
+ /* AC terms all zero; we need not examine term 4 for 4x4 output.
+ * All 4 workspace rows of this column get the dequantized DC value
+ * shifted left by PASS1_BITS.
+ */
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+
+ continue;
+ }
+
+ /* Even part: the DC term is shifted by CONST_BITS + 1, one bit more than
+ * the constants' scale; the descale below shifts by one extra bit too.
+ */
+
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS + 1);
+
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
+
+ tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, -FIX_0_765366865);
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ /* Odd part: note the order, z1..z4 hold rows 7, 5, 3, 1 respectively */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
+
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+
+ /* Final output stage: rows 0/3 and 1/2 are sum/difference pairs.
+ * NOTE(review): DESCALE is defined outside this file; presumably a
+ * rounding right shift -- confirm.
+ */
+
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 3] =
+ (int)DESCALE(tmp10 - tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp12 + tmp0, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 2] =
+ (int)DESCALE(tmp12 - tmp0, CONST_BITS - PASS1_BITS + 1);
+ }
+
+ /* Pass 2: process 4 rows from work array, store into output array.
+ * Each row reads wsptr[0..3] and wsptr[5..7]; wsptr[4], the column that
+ * pass 1 skipped, is never read.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+ /* It's not clear whether a zero row test is worthwhile here ... */
+
+#ifndef NO_ZERO_ROW_TEST
+ if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
+ wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+ /* AC terms all zero: all 4 samples of the row get the same value */
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
+
+ outptr[0] = dcval;
+ outptr[1] = dcval;
+ outptr[2] = dcval;
+ outptr[3] = dcval;
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ continue;
+ }
+#endif
+
+ /* Even part: same arithmetic as pass 1, fed from wsptr[0], [2], [6] */
+
+ tmp0 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 1);
+
+ tmp2 = MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) +
+ MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865);
+
+ tmp10 = tmp0 + tmp2;
+ tmp12 = tmp0 - tmp2;
+
+ /* Odd part: z1..z4 hold wsptr[7], [5], [3], [1] respectively */
+
+ z1 = (JLONG)wsptr[7];
+ z2 = (JLONG)wsptr[5];
+ z3 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[1];
+
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
+
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+
+ /* Final output stage: descale, mask with RANGE_MASK, look up in
+ * range_limit; samples 0/3 and 1/2 are sum/difference pairs.
+ */
+
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)DESCALE(tmp10 - tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp12 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)DESCALE(tmp12 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 2x2 output block.
+ *
+ * Parameters, as used by this routine:
+ * cinfo - only passed to IDCT_range_limit() to get the clamp table
+ * compptr - compptr->dct_table supplies the dequantization multipliers
+ * coef_block - input coefficients; rows 2, 4, 6 of every column and all of
+ * columns 2, 4, 6 are never read
+ * output_buf - 2 output rows are written, output_buf[0] and output_buf[1]
+ * output_col - column offset added to every output row pointer
+ */
+
+GLOBAL(void)
+jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JLONG tmp0, tmp10, z1;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ int *wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ int workspace[DCTSIZE * 2]; /* buffers data between passes: 2 rows of DCTSIZE ints */
+ SHIFT_TEMPS
+
+ /* Pass 1: process columns from input, store into work array.
+ * ctr counts down from DCTSIZE, so the column index is DCTSIZE - ctr.  The
+ * pointers are advanced in the for-statement itself, which keeps them in
+ * step when an iteration ends early with "continue".
+ */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
+ /* Don't bother to process columns 2,4,6 */
+ if (ctr == DCTSIZE - 2 || ctr == DCTSIZE - 4 || ctr == DCTSIZE - 6)
+ continue;
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 3] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 7] == 0) {
+ /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output.
+ * Both workspace rows of this column get the dequantized DC value
+ * shifted left by PASS1_BITS.
+ */
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+
+ continue;
+ }
+
+ /* Even part: only the DC term, shifted by CONST_BITS + 2 (two bits more
+ * than the constants' scale); the descale below shifts by two extra bits.
+ */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp10 = LEFT_SHIFT(z1, CONST_BITS + 2);
+
+ /* Odd part: z1 is reused for rows 7, 5, 3, 1 in turn; tmp0 accumulates */
+
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp0 = MULTIPLY(z1, -FIX_0_720959822); /* sqrt(2) * ( c7-c5+c3-c1) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp0 += MULTIPLY(z1, -FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
+
+ /* Final output stage: row 0 is the sum, row 1 the difference.
+ * NOTE(review): DESCALE is defined outside this file; presumably a
+ * rounding right shift -- confirm.
+ */
+
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp0, CONST_BITS - PASS1_BITS + 2);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp10 - tmp0, CONST_BITS - PASS1_BITS + 2);
+ }
+
+ /* Pass 2: process 2 rows from work array, store into output array.
+ * Each row reads wsptr[0], [1], [3], [5] and [7]; the entries for the
+ * columns that pass 1 skipped (2, 4, 6) are never read.
+ */
+
+ wsptr = workspace;
+ for (ctr = 0; ctr < 2; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+ /* It's not clear whether a zero row test is worthwhile here ... */
+
+#ifndef NO_ZERO_ROW_TEST
+ if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
+ /* AC terms all zero: both samples of the row get the same value */
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
+
+ outptr[0] = dcval;
+ outptr[1] = dcval;
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ continue;
+ }
+#endif
+
+ /* Even part: only wsptr[0], shifted by CONST_BITS + 2 as in pass 1 */
+
+ tmp10 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 2);
+
+ /* Odd part: same four products as pass 1, fed from wsptr[7], [5], [3], [1] */
+
+ tmp0 = MULTIPLY((JLONG)wsptr[7], -FIX_0_720959822) + /* sqrt(2) * ( c7-c5+c3-c1) */
+ MULTIPLY((JLONG)wsptr[5], FIX_0_850430095) + /* sqrt(2) * (-c1+c3+c5+c7) */
+ MULTIPLY((JLONG)wsptr[3], -FIX_1_272758580) + /* sqrt(2) * (-c1+c3-c5-c7) */
+ MULTIPLY((JLONG)wsptr[1], FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
+
+ /* Final output stage: descale, mask with RANGE_MASK, look up in
+ * range_limit; sample 0 is the sum, sample 1 the difference.
+ */
+
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
+
+ wsptr += DCTSIZE; /* advance pointer to next row */
+ }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 1x1 output block.
+ *
+ * Only the DC coefficient (coef_block[0]) and its multiplier (the first
+ * entry of compptr->dct_table) are used.  The single resulting sample is
+ * stored at output_buf[0][output_col].
+ */
+
+GLOBAL(void)
+jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *dc_multiplier = (ISLOW_MULT_TYPE *)compptr->dct_table;
+  JSAMPLE *clamp_table = IDCT_range_limit(cinfo);
+  int dc_term, average;
+  SHIFT_TEMPS
+
+  /* No real transform is required for a single output sample: it is the
+   * block average, i.e. the dequantized DC term descaled by 3 bits
+   * (one-eighth of the DC coefficient).
+   */
+  dc_term = DEQUANTIZE(coef_block[0], dc_multiplier[0]);
+  average = (int)DESCALE((JLONG)dc_term, 3);
+
+  output_buf[0][output_col] = clamp_table[average & RANGE_MASK];
+}
+
+#endif /* IDCT_SCALING_SUPPORTED */
diff --git a/media/libjpeg/jinclude.h b/media/libjpeg/jinclude.h
new file mode 100644
index 0000000000..e8d983ac17
--- /dev/null
+++ b/media/libjpeg/jinclude.h
@@ -0,0 +1,145 @@
+/*
+ * jinclude.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file exists to provide a single place to fix any problems with
+ * including the wrong system include files. (Common problems are taken
+ * care of by the standard jconfig symbols, but on really weird systems
+ * you may have to edit this file.)
+ *
+ * NOTE: this file is NOT intended to be included by applications using the
+ * JPEG library. Most applications need only include jpeglib.h.
+ */
+
+#ifndef __JINCLUDE_H__
+#define __JINCLUDE_H__
+
+/* Include auto-config file to find out which system include files we need. */
+
+#include "jconfig.h" /* auto configuration options */
+#include "jconfigint.h"
+#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */
+
+/*
+ * Note that the core JPEG library does not require <stdio.h>;
+ * only the default error handler and data source/destination modules do.
+ * But we must pull it in because of the references to FILE in jpeglib.h.
+ * You can remove those references if you want to compile without <stdio.h>.
+ */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * These macros/inline functions facilitate using Microsoft's "safe string"
+ * functions with Visual Studio builds without the need to scatter #ifdefs
+ * throughout the code base.
+ *
+ * SNPRINTF(str, n, format, ...) expands to _snprintf_s() with _TRUNCATE as
+ * the count argument when _MSC_VER is defined, and is a plain alias for
+ * snprintf otherwise.
+ */
+
+
+#ifdef _MSC_VER
+
+#define SNPRINTF(str, n, format, ...) \
+ _snprintf_s(str, n, _TRUNCATE, format, ##__VA_ARGS__)
+
+#else
+
+#define SNPRINTF snprintf
+
+#endif
+
+
+#ifndef NO_GETENV
+
+#ifdef _MSC_VER
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  /* getenv_s() reports the needed length through its first argument; this
+   * wrapper hands back only the status code, so that length is discarded.
+   */
+  size_t needed_len;
+  int status = (int)getenv_s(&needed_len, buffer, buffer_size, name);
+
+  return status;
+}
+
+#else /* _MSC_VER */
+
+#include <errno.h>
+
+/* Portable stand-in for the Microsoft/C11 getenv_s() function.  Apart from
+ * validating its parameters it offers nothing beyond getenv().
+ *
+ * Returns 0 when the value was copied, when the variable is unset or name is
+ * NULL (buffer then holds an empty string), or when buffer is NULL and
+ * buffer_size is 0.  Returns EINVAL (also stored in errno) for any other
+ * NULL-buffer / zero-size combination, and ERANGE (buffer emptied, errno
+ * untouched) when the value plus its terminator does not fit.
+ */
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  const char *value;
+
+  if (buffer == NULL)
+    return buffer_size == 0 ? 0 : (errno = EINVAL);
+  if (buffer_size == 0)
+    return (errno = EINVAL);
+
+  /* A NULL name is treated exactly like an unset variable. */
+  value = (name != NULL) ? getenv(name) : NULL;
+  if (value == NULL) {
+    buffer[0] = '\0';
+    return 0;
+  }
+
+  /* The terminating NUL has to fit as well. */
+  if (strlen(value) >= buffer_size) {
+    buffer[0] = '\0';
+    return ERANGE;
+  }
+
+  strncpy(buffer, value, buffer_size);
+  return 0;
+}
+
+#endif /* _MSC_VER */
+
+#endif /* NO_GETENV */
+
+
+#ifndef NO_PUTENV
+
+#ifdef _WIN32
+
+#define PUTENV_S(name, value) _putenv_s(name, value)
+
+#else
+
+/* This provides a similar interface to the Microsoft _putenv_s() function, but
+ * other than parameter validation, it has no advantages over setenv().
+ *
+ * Returns 0 on success.  Returns EINVAL (and sets errno) if name or value is
+ * NULL; otherwise returns the errno value left behind by a failed setenv().
+ */
+
+static INLINE int PUTENV_S(const char *name, const char *value)
+{
+  if (!name || !value)
+    return (errno = EINVAL);
+
+  /* errno is only meaningful when setenv() reports failure.  Returning it
+   * unconditionally would turn a stale errno from some earlier, unrelated
+   * call into a spurious error after a successful setenv().
+   */
+  if (setenv(name, value, 1) != 0)
+    return errno;
+
+  return 0;
+}
+
+#endif /* _WIN32 */
+
+#endif /* NO_PUTENV */
+
+
+#endif /* JINCLUDE_H */
diff --git a/media/libjpeg/jmemmgr.c b/media/libjpeg/jmemmgr.c
new file mode 100644
index 0000000000..a40446f6ac
--- /dev/null
+++ b/media/libjpeg/jmemmgr.c
@@ -0,0 +1,1180 @@
+/*
+ * jmemmgr.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, 2021-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the JPEG system-independent memory management
+ * routines. This code is usable across a wide variety of machines; most
+ * of the system dependencies have been isolated in a separate file.
+ * The major functions provided here are:
+ * * pool-based allocation and freeing of memory;
+ * * policy decisions about how to divide available memory among the
+ * virtual arrays;
+ * * control logic for swapping virtual arrays between main memory and
+ * backing storage.
+ * The separate system-dependent file provides the actual backing-storage
+ * access code, and it contains the policy decision about how much total
+ * main memory to use.
+ * This file is system-dependent in the sense that some of its functions
+ * are unnecessary in some systems. For example, if there is enough virtual
+ * memory so that backing storage will never be used, much of the virtual
+ * array control logic could be removed. (Of course, if you have that much
+ * memory then you shouldn't care about a little bit of unused code...)
+ */
+
+#define JPEG_INTERNALS
+#define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jmemsys.h" /* import the system-dependent declarations */
+#if !defined(_MSC_VER) || _MSC_VER > 1600
+#include <stdint.h>
+#endif
+#include <limits.h>
+
+
+LOCAL(size_t)
+round_up_pow2(size_t a, size_t b)
+/* Smallest multiple of b that is >= a; b must be a nonzero power of 2 */
+{
+  const size_t mask = b - 1;    /* low bits that must be cleared */
+  return (a + mask) & ~mask;
+}
+
+
+/*
+ * Some important notes:
+ * The allocation routines provided here must never return NULL.
+ * They should exit to error_exit if unsuccessful.
+ *
+ * It's not a good idea to try to merge the sarray and barray routines,
+ * even though they are textually almost the same, because samples are
+ * usually stored as bytes while coefficients are shorts or ints. Thus,
+ * in machines where byte pointers have a different representation from
+ * word pointers, the resulting machine code could not be the same.
+ */
+
+
+/*
+ * Many machines require storage alignment: longs must start on 4-byte
+ * boundaries, doubles on 8-byte boundaries, etc. On such machines, malloc()
+ * always returns pointers that are multiples of the worst-case alignment
+ * requirement, and we had better do so too.
+ * There isn't any really portable way to determine the worst-case alignment
+ * requirement. This module assumes that the alignment requirement is
+ * multiples of ALIGN_SIZE.
+ * By default, we define ALIGN_SIZE as the maximum of sizeof(double) and
+ * sizeof(void *). This is necessary on some workstations (where doubles
+ * really do need 8-byte alignment) and will work fine on nearly everything.
+ * We use the maximum of sizeof(double) and sizeof(void *) since sizeof(double)
+ * may be insufficient, for example, on CHERI-enabled platforms with 16-byte
+ * pointers and a 16-byte alignment requirement. If your machine has lesser
+ * alignment needs, you can save a few bytes by making ALIGN_SIZE smaller.
+ * The only place I know of where this will NOT work is certain Macintosh
+ * 680x0 compilers that define double as a 10-byte IEEE extended float.
+ * Doing 10-byte alignment is counterproductive because longwords won't be
+ * aligned well. Put "#define ALIGN_SIZE 4" in jconfig.h if you have
+ * such a compiler.
+ */
+
+#ifndef ALIGN_SIZE /* so can override from jconfig.h */
+#ifndef WITH_SIMD
+#define ALIGN_SIZE MAX(sizeof(void *), sizeof(double))
+#else
+#define ALIGN_SIZE 32 /* Most of the SIMD instructions we support require
+ 16-byte (128-bit) alignment, but AVX2 requires
+ 32-byte alignment. */
+#endif
+#endif
+
+/*
+ * We allocate objects from "pools", where each pool is gotten with a single
+ * request to jpeg_get_small() or jpeg_get_large(). There is no per-object
+ * overhead within a pool, except for alignment padding. Each pool has a
+ * header with a link to the next pool of the same class.
+ * Small and large pool headers are identical.
+ */
+
+typedef struct small_pool_struct *small_pool_ptr;
+/* Header at the start of each pool that alloc_small() carves objects from. */
+typedef struct small_pool_struct {
+ small_pool_ptr next; /* next in list of pools */
+ size_t bytes_used; /* how many bytes already used within pool */
+ size_t bytes_left; /* bytes still available in this pool */
+} small_pool_hdr;
+
+typedef struct large_pool_struct *large_pool_ptr;
+/* Header at the start of each pool created by alloc_large(). */
+typedef struct large_pool_struct {
+ large_pool_ptr next; /* next in list of pools */
+ size_t bytes_used; /* size of the single object stored in this pool */
+ size_t bytes_left; /* always set to 0 by alloc_large(); kept for statistics */
+} large_pool_hdr;
+
+/*
+ * Here is the full definition of a memory manager object.
+ */
+
+typedef struct {
+ struct jpeg_memory_mgr pub; /* public fields */
+ /* NOTE(review): pub presumably must stay first, since cinfo->mem is cast to
+ * my_mem_ptr throughout this file -- confirm against jpeglib.h. */
+ /* Each pool identifier (lifetime class) names a linked list of pools. */
+ small_pool_ptr small_list[JPOOL_NUMPOOLS];
+ large_pool_ptr large_list[JPOOL_NUMPOOLS];
+
+ /* Since we only have one lifetime class of virtual arrays, only one
+ * linked list is necessary (for each datatype). Note that the virtual
+ * array control blocks being linked together are actually stored somewhere
+ * in the small-pool list.
+ */
+ jvirt_sarray_ptr virt_sarray_list;
+ jvirt_barray_ptr virt_barray_list;
+
+ /* This counts total space (in bytes) obtained from jpeg_get_small/large */
+ size_t total_space_allocated;
+
+ /* alloc_sarray and alloc_barray set this value for use by virtual
+ * array routines.
+ */
+ JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */
+} my_memory_mgr;
+/* Pointer type used when casting cinfo->mem back to the full object. */
+typedef my_memory_mgr *my_mem_ptr;
+
+
+/*
+ * The control blocks for virtual arrays.
+ * Note that these blocks are allocated in the "small" pool area.
+ * System-dependent info for the associated backing store (if any) is hidden
+ * inside the backing_store_info struct.
+ */
+
+struct jvirt_sarray_control {
+ JSAMPARRAY mem_buffer; /* => the in-memory buffer (NULL until realized) */
+ JDIMENSION rows_in_array; /* total virtual array height */
+ JDIMENSION samplesperrow; /* width of array (and of memory buffer) */
+ JDIMENSION maxaccess; /* max rows accessed by access_virt_sarray */
+ JDIMENSION rows_in_mem; /* height of memory buffer */
+ JDIMENSION rowsperchunk; /* allocation chunk size (in rows) in mem_buffer */
+ JDIMENSION cur_start_row; /* first logical row # in the buffer */
+ JDIMENSION first_undef_row; /* row # of first uninitialized row */
+ boolean pre_zero; /* pre-zero mode requested? */
+ boolean dirty; /* do current buffer contents need to be written? */
+ boolean b_s_open; /* is backing-store data valid? */
+ jvirt_sarray_ptr next; /* link to next virtual sarray control block */
+ backing_store_info b_s_info; /* System-dependent control info */
+};
+
+struct jvirt_barray_control {
+ JBLOCKARRAY mem_buffer; /* => the in-memory buffer (NULL until realized) */
+ JDIMENSION rows_in_array; /* total virtual array height */
+ JDIMENSION blocksperrow; /* width of array (and of memory buffer) */
+ JDIMENSION maxaccess; /* max rows accessed by access_virt_barray */
+ JDIMENSION rows_in_mem; /* height of memory buffer */
+ JDIMENSION rowsperchunk; /* allocation chunk size (in rows) in mem_buffer */
+ JDIMENSION cur_start_row; /* first logical row # in the buffer */
+ JDIMENSION first_undef_row; /* row # of first uninitialized row */
+ boolean pre_zero; /* pre-zero mode requested? */
+ boolean dirty; /* do current buffer contents need to be written? */
+ boolean b_s_open; /* is backing-store data valid? */
+ jvirt_barray_ptr next; /* link to next virtual barray control block */
+ backing_store_info b_s_info; /* System-dependent control info */
+};
+
+
+#ifdef MEM_STATS /* optional extra stuff for statistics */
+
+LOCAL(void)
+print_mem_stats(j_common_ptr cinfo, int pool_id)
+/* Debugging aid: dump allocation statistics for one pool class to stderr */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  small_pool_ptr shdr_ptr;
+  large_pool_ptr lhdr_ptr;
+
+  /* Since this is only a debugging stub, we can cheat a little by using
+   * fprintf directly rather than going through the trace message code.
+   * Every size_t counter is cast to long so that it matches the %ld
+   * conversion; passing a bare size_t for %ld is undefined behavior.
+   */
+  fprintf(stderr, "Freeing pool %d, total space = %ld\n",
+          pool_id, (long)mem->total_space_allocated);
+  for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
+       lhdr_ptr = lhdr_ptr->next) {
+    fprintf(stderr, "  Large chunk used %ld\n", (long)lhdr_ptr->bytes_used);
+  }
+
+  for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
+       shdr_ptr = shdr_ptr->next) {
+    fprintf(stderr, "  Small chunk used %ld free %ld\n",
+            (long)shdr_ptr->bytes_used, (long)shdr_ptr->bytes_left);
+  }
+}
+
+#endif /* MEM_STATS */
+
+
+LOCAL(void)
+out_of_memory(j_common_ptr cinfo, int which)
+/* Report an out-of-memory error and stop execution; "which" is passed to */
+/* ERREXIT1 as the message parameter and differs for every call site. */
+{
+#ifdef MEM_STATS
+ cinfo->err->trace_level = 2; /* force self_destruct to report stats */
+#endif
+ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, which);
+}
+
+
+/*
+ * Allocation of "small" objects.
+ *
+ * For these, we use pooled storage. When a new pool must be created,
+ * we try to get enough space for the current request plus a "slop" factor,
+ * where the slop will be the amount of leftover space in the new pool.
+ * The speed vs. space tradeoff is largely determined by the slop values.
+ * A different slop value is provided for each pool class (lifetime),
+ * and we also distinguish the first pool of a class from later ones.
+ * NOTE: the values given work fairly well on both 16- and 32-bit-int
+ * machines, but may be too small if longs are 64 bits or more.
+ *
+ * Since we do not know what alignment malloc() gives us, we have to
+ * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment
+ * adjustment.
+ */
+
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] = {
+ 1600, /* first PERMANENT pool */
+ 16000 /* first IMAGE pool */
+};
+/* Slop requested for every pool after the first one in a class. */
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = {
+ 0, /* additional PERMANENT pools */
+ 5000 /* additional IMAGE pools */
+};
+/* alloc_small() stops halving the slop and gives up once it drops below this */
+#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */
+
+
+METHODDEF(void *)
+alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+/* Carve a "small" object out of a pool of the given lifetime class */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  small_pool_ptr pool, tail = NULL;
+  char *obj;
+  size_t needed, spare;
+
+  /*
+   * The request is padded to a multiple of ALIGN_SIZE.  That keeps the next
+   * object carved from the same pool aligned, and it lets algorithms
+   * straddle outside the proper area up to the next alignment.
+   */
+  if (sizeofobject > MAX_ALLOC_CHUNK) {
+    /* Reject this first so that round_up_pow2() cannot overflow/wrap around
+       when sizeofobject is close to SIZE_MAX. */
+    out_of_memory(cinfo, 7);
+  }
+  sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE);
+
+  /* Header + object + worst-case alignment padding is the least we would
+   * ever request for a new pool.  Bail out now if even that is
+   * unsatisfiable, which also ensures that nothing below can overflow.
+   */
+  needed = sizeof(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
+  if (needed > MAX_ALLOC_CHUNK)
+    out_of_memory(cinfo, 1);    /* request exceeds malloc's ability */
+
+  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
+
+  /* Walk this class's pool list looking for one with enough room.  "tail"
+   * trails one step behind so that a new pool can be appended after it.
+   */
+  for (pool = mem->small_list[pool_id]; pool != NULL; pool = pool->next) {
+    if (pool->bytes_left >= sizeofobject)
+      break;                    /* found pool with enough space */
+    tail = pool;
+  }
+
+  if (pool == NULL) {           /* no suitable pool; make a new one */
+    /* "needed" is what we must have now; "spare" is the slop that will be
+     * left over in the new pool.  The first pool of a class gets a
+     * different slop than later ones.
+     */
+    spare = (tail == NULL) ? first_pool_slop[pool_id] :
+                             extra_pool_slop[pool_id];
+    /* Don't ask for more than MAX_ALLOC_CHUNK */
+    if (spare > (size_t)(MAX_ALLOC_CHUNK - needed))
+      spare = (size_t)(MAX_ALLOC_CHUNK - needed);
+    /* Try to get space; on failure, halve the slop and try again */
+    while ((pool = (small_pool_ptr)jpeg_get_small(cinfo, needed + spare)) ==
+           NULL) {
+      spare /= 2;
+      if (spare < MIN_SLOP)     /* give up when it gets real small */
+        out_of_memory(cinfo, 2);        /* jpeg_get_small failed */
+    }
+    mem->total_space_allocated += needed + spare;
+    /* Success: initialize the new pool header and add it to end of list */
+    pool->next = NULL;
+    pool->bytes_used = 0;
+    pool->bytes_left = sizeofobject + spare;
+    if (tail != NULL)
+      tail->next = pool;
+    else                        /* first pool in class */
+      mem->small_list[pool_id] = pool;
+  }
+
+  /* The pool's data area begins at the first ALIGN_SIZE boundary past the
+   * header; the new object goes right after the bytes already handed out.
+   */
+  obj = (char *)pool + sizeof(small_pool_hdr);
+  if ((size_t)obj % ALIGN_SIZE)
+    obj += ALIGN_SIZE - (size_t)obj % ALIGN_SIZE;
+  obj += pool->bytes_used;
+  pool->bytes_used += sizeofobject;
+  pool->bytes_left -= sizeofobject;
+
+  return (void *)obj;
+}
+
+
+/*
+ * Allocation of "large" objects.
+ *
+ * The external semantics of these are the same as "small" objects. However,
+ * the pool management heuristics are quite different. We assume that each
+ * request is large enough that it may as well be passed directly to
+ * jpeg_get_large; the pool management just links everything together
+ * so that we can free it all on demand.
+ * Note: the major use of "large" objects is in JSAMPARRAY and JBLOCKARRAY
+ * structures. The routines that create these structures (see below)
+ * deliberately bunch rows together to ensure a large request size.
+ */
+
+METHODDEF(void *)
+alloc_large(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+/* Allocate a "large" object; each one gets a dedicated pool of its own */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  large_pool_ptr hdr_ptr;
+  char *data_ptr;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE so that
+   * algorithms can straddle outside the proper area up to the next
+   * alignment.
+   */
+  if (sizeofobject > MAX_ALLOC_CHUNK) {
+    /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject
+       is close to SIZE_MAX. */
+    out_of_memory(cinfo, 8);
+  }
+  sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE);
+
+  /* Check for unsatisfiable request (do now to ensure no overflow below) */
+  if ((sizeof(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) >
+      MAX_ALLOC_CHUNK)
+    out_of_memory(cinfo, 3);    /* request exceeds malloc's ability */
+
+  /* Always make a new pool */
+  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
+
+  hdr_ptr = (large_pool_ptr)jpeg_get_large(cinfo, sizeofobject +
+                                           sizeof(large_pool_hdr) +
+                                           ALIGN_SIZE - 1);
+  if (hdr_ptr == NULL)
+    out_of_memory(cinfo, 4);    /* jpeg_get_large failed */
+  mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
+                                ALIGN_SIZE - 1;
+
+  /* Success, initialize the new pool header and add to front of list */
+  hdr_ptr->next = mem->large_list[pool_id];
+  /* We maintain space counts in each pool header for statistical purposes,
+   * even though they are not needed for allocation.
+   */
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
+  mem->large_list[pool_id] = hdr_ptr;
+
+  data_ptr = (char *)hdr_ptr;   /* point to first data byte in pool... */
+  data_ptr += sizeof(large_pool_hdr);   /* ...by skipping the large header... */
+  if ((size_t)data_ptr % ALIGN_SIZE)    /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
+
+  return (void *)data_ptr;
+}
+
+
+/*
+ * Creation of 2-D sample arrays.
+ *
+ * To minimize allocation overhead and to allow I/O of large contiguous
+ * blocks, we allocate the sample rows in groups of as many rows as possible
+ * without exceeding MAX_ALLOC_CHUNK total bytes per allocation request.
+ * NB: the virtual array control routines, later in this file, know about
+ * this chunking of rows. The rowsperchunk value is left in the mem manager
+ * object so that it can be saved away if this sarray is the workspace for
+ * a virtual array.
+ *
+ * Since we are often upsampling with a factor 2, we align the size (not
+ * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have
+ * to be as careful about size.
+ */
+
+METHODDEF(JSAMPARRAY)
+alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+             JDIMENSION numrows)
+/* Build a 2-D sample array: a row-pointer vector plus chunked row storage */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  JSAMPARRAY rows;
+  JSAMPROW chunk;
+  JDIMENSION chunk_rows, next_row, k;
+  long max_rows;
+
+  /* Rows only stay aligned if ALIGN_SIZE is a multiple of the sample size */
+  if ((ALIGN_SIZE % sizeof(JSAMPLE)) != 0)
+    out_of_memory(cinfo, 5);    /* safety check */
+
+  if (samplesperrow > MAX_ALLOC_CHUNK) {
+    /* Reject oversized widths up front so that the round_up_pow2() call
+       below cannot overflow/wrap around. */
+    out_of_memory(cinfo, 9);
+  }
+  /* Pad the row width to a multiple of (2 * ALIGN_SIZE) / sizeof(JSAMPLE) */
+  samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) /
+                                                           sizeof(JSAMPLE));
+
+  /* Work out how many padded rows fit in one allocation chunk */
+  max_rows = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+             ((long)samplesperrow * sizeof(JSAMPLE));
+  if (max_rows <= 0)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+  chunk_rows = (max_rows < (long)numrows) ? (JDIMENSION)max_rows : numrows;
+  /* The virtual array routines pick this value up afterwards */
+  mem->last_rowsperchunk = chunk_rows;
+
+  /* The vector of row pointers is a small object... */
+  rows = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+                                 (size_t)(numrows * sizeof(JSAMPROW)));
+
+  /* ...while the rows themselves come from large objects, one per chunk */
+  for (next_row = 0; next_row < numrows; ) {
+    /* The final chunk may be shorter than the others */
+    chunk_rows = MIN(chunk_rows, numrows - next_row);
+    chunk = (JSAMPROW)alloc_large(cinfo, pool_id,
+      (size_t)((size_t)chunk_rows * (size_t)samplesperrow *
+               sizeof(JSAMPLE)));
+    /* Point each row of this chunk at its slice of the storage */
+    for (k = 0; k < chunk_rows; k++) {
+      rows[next_row++] = chunk;
+      chunk += samplesperrow;
+    }
+  }
+
+  return rows;
+}
+
+
+/*
+ * Creation of 2-D coefficient-block arrays.
+ * This is essentially the same as the code for sample arrays, above.
+ */
+
+METHODDEF(JBLOCKARRAY)
+alloc_barray(j_common_ptr cinfo, int pool_id, JDIMENSION blocksperrow,
+             JDIMENSION numrows)
+/* Build a 2-D array of coefficient blocks, chunked like a sample array */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  JBLOCKARRAY rows;
+  JBLOCKROW chunk;
+  JDIMENSION chunk_rows, next_row, k;
+  long max_rows;
+
+  /* Rows only stay aligned if the size of a JBLOCK is a multiple of
+   * ALIGN_SIZE */
+  if ((sizeof(JBLOCK) % ALIGN_SIZE) != 0)
+    out_of_memory(cinfo, 6);    /* safety check */
+
+  /* Work out how many rows fit in one allocation chunk */
+  max_rows = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+             ((long)blocksperrow * sizeof(JBLOCK));
+  if (max_rows <= 0)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+  chunk_rows = (max_rows < (long)numrows) ? (JDIMENSION)max_rows : numrows;
+  /* The virtual array routines pick this value up afterwards */
+  mem->last_rowsperchunk = chunk_rows;
+
+  /* The vector of row pointers is a small object... */
+  rows = (JBLOCKARRAY)alloc_small(cinfo, pool_id,
+                                  (size_t)(numrows * sizeof(JBLOCKROW)));
+
+  /* ...while the rows themselves come from large objects, one per chunk */
+  for (next_row = 0; next_row < numrows; ) {
+    /* The final chunk may be shorter than the others */
+    chunk_rows = MIN(chunk_rows, numrows - next_row);
+    chunk = (JBLOCKROW)alloc_large(cinfo, pool_id,
+      (size_t)((size_t)chunk_rows * (size_t)blocksperrow *
+               sizeof(JBLOCK)));
+    /* Point each row of this chunk at its slice of the storage */
+    for (k = 0; k < chunk_rows; k++) {
+      rows[next_row++] = chunk;
+      chunk += blocksperrow;
+    }
+  }
+
+  return rows;
+}
+
+
+/*
+ * About virtual array management:
+ *
+ * The above "normal" array routines are only used to allocate strip buffers
+ * (as wide as the image, but just a few rows high). Full-image-sized buffers
+ * are handled as "virtual" arrays. The array is still accessed a strip at a
+ * time, but the memory manager must save the whole array for repeated
+ * accesses. The intended implementation is that there is a strip buffer in
+ * memory (as high as is possible given the desired memory limit), plus a
+ * backing file that holds the rest of the array.
+ *
+ * The request_virt_array routines are told the total size of the image and
+ * the maximum number of rows that will be accessed at once. The in-memory
+ * buffer must be at least as large as the maxaccess value.
+ *
+ * The request routines create control blocks but not the in-memory buffers.
+ * That is postponed until realize_virt_arrays is called. At that time the
+ * total amount of space needed is known (approximately, anyway), so free
+ * memory can be divided up fairly.
+ *
+ * The access_virt_array routines are responsible for making a specific strip
+ * area accessible (after reading or writing the backing file, if necessary).
+ * Note that the access routines are told whether the caller intends to modify
+ * the accessed strip; during a read-only pass this saves having to rewrite
+ * data to disk. The access routines are also responsible for pre-zeroing
+ * any newly accessed rows, if pre-zeroing was requested.
+ *
+ * In current usage, the access requests are usually for nonoverlapping
+ * strips; that is, successive access start_row numbers differ by exactly
+ * num_rows = maxaccess. This means we can get good performance with simple
+ * buffer dump/reload logic, by making the in-memory buffer be a multiple
+ * of the access height; then there will never be accesses across bufferload
+ * boundaries. The code will still work with overlapping access requests,
+ * but it doesn't handle bufferload overlaps very efficiently.
+ */
+
+
+METHODDEF(jvirt_sarray_ptr)
+request_virt_sarray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION samplesperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
+/* Register a virtual 2-D sample array; no in-memory buffer is made here */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  jvirt_sarray_ptr ctl;
+
+  /* Only IMAGE-lifetime virtual arrays are currently supported */
+  if (pool_id != JPOOL_IMAGE)
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
+
+  ctl = (jvirt_sarray_ptr)      /* control block is a small object */
+    alloc_small(cinfo, pool_id, sizeof(struct jvirt_sarray_control));
+
+  ctl->samplesperrow = samplesperrow;
+  ctl->rows_in_array = numrows;
+  ctl->maxaccess = maxaccess;
+  ctl->pre_zero = pre_zero;
+  ctl->mem_buffer = NULL;       /* marks array not yet realized */
+  ctl->b_s_open = FALSE;        /* no associated backing-store object */
+  /* Push onto the front of the list of virtual sample arrays */
+  ctl->next = mem->virt_sarray_list;
+  mem->virt_sarray_list = ctl;
+
+  return ctl;
+}
+
+
+METHODDEF(jvirt_barray_ptr)
+request_virt_barray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION blocksperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
+/* Register a virtual 2-D block array; no in-memory buffer is made here */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  jvirt_barray_ptr ctl;
+
+  /* Only IMAGE-lifetime virtual arrays are currently supported */
+  if (pool_id != JPOOL_IMAGE)
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
+
+  ctl = (jvirt_barray_ptr)      /* control block is a small object */
+    alloc_small(cinfo, pool_id, sizeof(struct jvirt_barray_control));
+
+  ctl->blocksperrow = blocksperrow;
+  ctl->rows_in_array = numrows;
+  ctl->maxaccess = maxaccess;
+  ctl->pre_zero = pre_zero;
+  ctl->mem_buffer = NULL;       /* marks array not yet realized */
+  ctl->b_s_open = FALSE;        /* no associated backing-store object */
+  /* Push onto the front of the list of virtual block arrays */
+  ctl->next = mem->virt_barray_list;
+  mem->virt_barray_list = ctl;
+
+  return ctl;
+}
+
+
+METHODDEF(void)
+realize_virt_arrays(j_common_ptr cinfo)
+/* Allocate the in-memory buffers for any unrealized virtual arrays */
+{
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
+  size_t space_per_minheight, maximum_space, avail_mem;
+  size_t minheights, max_minheights;
+  jvirt_sarray_ptr sptr;
+  jvirt_barray_ptr bptr;
+
+  /* Compute the minimum space needed (maxaccess rows in each buffer)
+   * and the maximum space needed (full image height in each buffer).
+   * These may be of use to the system-dependent jpeg_mem_available routine.
+   */
+  space_per_minheight = 0;
+  maximum_space = 0;
+  for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
+    if (sptr->mem_buffer == NULL) { /* if not realized yet */
+      size_t new_space = (size_t)sptr->rows_in_array *
+                         (size_t)sptr->samplesperrow * sizeof(JSAMPLE);
+      /* (size_t math throughout: long may be too narrow for the product) */
+      space_per_minheight += (size_t)sptr->maxaccess *
+                             (size_t)sptr->samplesperrow * sizeof(JSAMPLE);
+      if (SIZE_MAX - maximum_space < new_space)
+        out_of_memory(cinfo, 10);
+      maximum_space += new_space;
+    }
+  }
+  for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
+    if (bptr->mem_buffer == NULL) { /* if not realized yet */
+      size_t new_space = (size_t)bptr->rows_in_array *
+                         (size_t)bptr->blocksperrow * sizeof(JBLOCK);
+      /* (size_t math throughout: long may be too narrow for the product) */
+      space_per_minheight += (size_t)bptr->maxaccess *
+                             (size_t)bptr->blocksperrow * sizeof(JBLOCK);
+      if (SIZE_MAX - maximum_space < new_space)
+        out_of_memory(cinfo, 11);
+      maximum_space += new_space;
+    }
+  }
+
+  if (space_per_minheight <= 0)
+    return;                     /* no unrealized arrays, no work */
+
+  /* Determine amount of memory to actually use; this is system-dependent. */
+  avail_mem = jpeg_mem_available(cinfo, space_per_minheight, maximum_space,
+                                 mem->total_space_allocated);
+
+  /* If the maximum space needed is available, make all the buffers full
+   * height; otherwise parcel it out with the same number of minheights
+   * in each buffer.
+   */
+  if (avail_mem >= maximum_space)
+    max_minheights = 1000000000L;
+  else {
+    max_minheights = avail_mem / space_per_minheight;
+    /* If there doesn't seem to be enough space, try to get the minimum
+     * anyway.  This allows a "stub" implementation of jpeg_mem_available().
+     */
+    if (max_minheights <= 0)
+      max_minheights = 1;
+  }
+
+  /* Allocate the in-memory buffers and initialize backing store as needed. */
+
+  for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
+    if (sptr->mem_buffer == NULL) { /* if not realized yet */
+      minheights = ((long)sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
+      if (minheights <= max_minheights) {
+        /* This buffer fits in memory */
+        sptr->rows_in_mem = sptr->rows_in_array;
+      } else {
+        /* It doesn't fit in memory, create backing store. */
+        sptr->rows_in_mem = (JDIMENSION)(max_minheights * sptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &sptr->b_s_info,
+                                (long)sptr->rows_in_array *
+                                (long)sptr->samplesperrow *
+                                (long)sizeof(JSAMPLE));
+        sptr->b_s_open = TRUE;
+      }
+      sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
+                                      sptr->samplesperrow, sptr->rows_in_mem);
+      sptr->rowsperchunk = mem->last_rowsperchunk;
+      sptr->cur_start_row = 0;
+      sptr->first_undef_row = 0;
+      sptr->dirty = FALSE;
+    }
+  }
+
+  for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
+    if (bptr->mem_buffer == NULL) { /* if not realized yet */
+      minheights = ((long)bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
+      if (minheights <= max_minheights) {
+        /* This buffer fits in memory */
+        bptr->rows_in_mem = bptr->rows_in_array;
+      } else {
+        /* It doesn't fit in memory, create backing store. */
+        bptr->rows_in_mem = (JDIMENSION)(max_minheights * bptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &bptr->b_s_info,
+                                (long)bptr->rows_in_array *
+                                (long)bptr->blocksperrow *
+                                (long)sizeof(JBLOCK));
+        bptr->b_s_open = TRUE;
+      }
+      bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
+                                      bptr->blocksperrow, bptr->rows_in_mem);
+      bptr->rowsperchunk = mem->last_rowsperchunk;
+      bptr->cur_start_row = 0;
+      bptr->first_undef_row = 0;
+      bptr->dirty = FALSE;
+    }
+  }
+}
+
+
+LOCAL(void)
+do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
+/* Write (writing != 0) or read the in-memory window of a virtual sample array */
+{
+ long bytesperrow, file_offset, byte_count, rows, thisrow, i;
+ /* The window starts at cur_start_row, so the file offset starts there too */
+ bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
+ file_offset = ptr->cur_start_row * bytesperrow;
+ /* Loop to read or write each allocation chunk in mem_buffer */
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ /* One chunk, but check for short chunk at end of buffer */
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
+ /* Transfer no more than is currently defined */
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
+ /* Transfer no more than fits in file */
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
+ if (rows <= 0) /* this chunk might be past end of file! */
+ break;
+ byte_count = rows * bytesperrow;
+ if (writing) /* mem_buffer[i] is the first row of the current chunk */
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
+ file_offset, byte_count);
+ else
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
+ file_offset, byte_count);
+ file_offset += byte_count; /* next chunk follows directly in the file */
+ }
+}
+
+
+LOCAL(void)
+do_barray_io(j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
+/* Write (writing != 0) or read the in-memory window of a virtual block array */
+{
+ long bytesperrow, file_offset, byte_count, rows, thisrow, i;
+ /* The window starts at cur_start_row, so the file offset starts there too */
+ bytesperrow = (long)ptr->blocksperrow * sizeof(JBLOCK);
+ file_offset = ptr->cur_start_row * bytesperrow;
+ /* Loop to read or write each allocation chunk in mem_buffer */
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ /* One chunk, but check for short chunk at end of buffer */
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
+ /* Transfer no more than is currently defined */
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
+ /* Transfer no more than fits in file */
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
+ if (rows <= 0) /* this chunk might be past end of file! */
+ break;
+ byte_count = rows * bytesperrow;
+ if (writing) /* mem_buffer[i] is the first row of the current chunk */
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
+ file_offset, byte_count);
+ else
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
+ file_offset, byte_count);
+ file_offset += byte_count; /* next chunk follows directly in the file */
+ }
+}
+
+
+METHODDEF(JSAMPARRAY)
+access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
+/* Access the part of a virtual sample array starting at start_row */
+/* and extending for num_rows rows. writable is true if */
+/* caller intends to modify the accessed area. */
+{
+ JDIMENSION end_row = start_row + num_rows;
+ JDIMENSION undef_row;
+ /* debugging check. NOTE(review): end_row is an unsigned sum and could wrap
+ * for a start_row near the JDIMENSION maximum -- confirm callers never do. */
+ if (end_row > ptr->rows_in_array || num_rows > ptr->maxaccess ||
+ ptr->mem_buffer == NULL)
+ ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+
+ /* Make the desired part of the virtual array accessible */
+ if (start_row < ptr->cur_start_row ||
+ end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+ if (!ptr->b_s_open) /* window must move, but there is no backing store */
+ ERREXIT(cinfo, JERR_VIRTUAL_BUG);
+ /* Flush old buffer contents if necessary */
+ if (ptr->dirty) {
+ do_sarray_io(cinfo, ptr, TRUE);
+ ptr->dirty = FALSE;
+ }
+ /* Decide what part of virtual array to access.
+ * Algorithm: if target address > current window, assume forward scan,
+ * load starting at target address. If target address < current window,
+ * assume backward scan, load so that target area is top of window.
+ * Note that when switching from forward write to forward read, will have
+ * start_row = 0, so the limiting case applies and we load from 0 anyway.
+ */
+ if (start_row > ptr->cur_start_row) {
+ ptr->cur_start_row = start_row;
+ } else {
+ /* use long arithmetic here to avoid overflow & unsigned problems */
+ long ltemp;
+
+ ltemp = (long)end_row - (long)ptr->rows_in_mem;
+ if (ltemp < 0)
+ ltemp = 0; /* don't fall off front end of file */
+ ptr->cur_start_row = (JDIMENSION)ltemp;
+ }
+ /* Read in the selected part of the array.
+ * During the initial write pass, we will do no actual read
+ * because the selected part is all undefined.
+ */
+ do_sarray_io(cinfo, ptr, FALSE);
+ }
+ /* Ensure the accessed part of the array is defined; prezero if needed.
+ * To improve locality of access, we only prezero the part of the array
+ * that the caller is about to access, not the entire in-memory array.
+ */
+ if (ptr->first_undef_row < end_row) {
+ if (ptr->first_undef_row < start_row) {
+ if (writable) /* writer skipped over a section of array */
+ ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+ undef_row = start_row; /* but reader is allowed to read ahead */
+ } else {
+ undef_row = ptr->first_undef_row;
+ }
+ if (writable) /* rows up to end_row count as defined from now on */
+ ptr->first_undef_row = end_row;
+ if (ptr->pre_zero) {
+ size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
+ undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
+ end_row -= ptr->cur_start_row;
+ while (undef_row < end_row) { /* zero each newly exposed row */
+ jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
+ undef_row++;
+ }
+ } else {
+ if (!writable) /* reader looking at undefined data */
+ ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+ }
+ }
+ /* Flag the buffer dirty if caller will write in it */
+ if (writable)
+ ptr->dirty = TRUE;
+ /* Return address of proper part of the buffer */
+ return ptr->mem_buffer + (start_row - ptr->cur_start_row);
+}
+
+
+METHODDEF(JBLOCKARRAY)
+access_virt_barray(j_common_ptr cinfo, jvirt_barray_ptr ptr,
+                   JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
+/* Access the part of a virtual block array starting at start_row and */
+/* extending for num_rows rows; returns a pointer to the first such row. */
+/* writable is true if caller intends to modify the accessed area. */
+{
+  JDIMENSION end_row = start_row + num_rows;
+  JDIMENSION undef_row;
+
+  /* sanity check; end_row < start_row means the unsigned sum wrapped around */
+  if (end_row < start_row || end_row > ptr->rows_in_array ||
+      num_rows > ptr->maxaccess || ptr->mem_buffer == NULL)
+    ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+
+  /* Make the desired part of the virtual array accessible */
+  if (start_row < ptr->cur_start_row ||
+      end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+    if (!ptr->b_s_open)
+      ERREXIT(cinfo, JERR_VIRTUAL_BUG);
+    /* Flush old buffer contents if necessary */
+    if (ptr->dirty) {
+      do_barray_io(cinfo, ptr, TRUE);
+      ptr->dirty = FALSE;
+    }
+    /* Decide what part of virtual array to access.
+     * Algorithm: if target address > current window, assume forward scan,
+     * load starting at target address.  If target address < current window,
+     * assume backward scan, load so that target area is top of window.
+     * Note that when switching from forward write to forward read, will have
+     * start_row = 0, so the limiting case applies and we load from 0 anyway.
+     */
+    if (start_row > ptr->cur_start_row) {
+      ptr->cur_start_row = start_row;
+    } else {
+      /* use long arithmetic here to avoid overflow & unsigned problems */
+      long ltemp;
+
+      ltemp = (long)end_row - (long)ptr->rows_in_mem;
+      if (ltemp < 0)
+        ltemp = 0;              /* don't fall off front end of file */
+      ptr->cur_start_row = (JDIMENSION)ltemp;
+    }
+    /* Read in the selected part of the array.
+     * During the initial write pass, we will do no actual read
+     * because the selected part is all undefined.
+     */
+    do_barray_io(cinfo, ptr, FALSE);
+  }
+  /* Ensure the accessed part of the array is defined; prezero if needed.
+   * To improve locality of access, we only prezero the part of the array
+   * that the caller is about to access, not the entire in-memory array.
+   */
+  if (ptr->first_undef_row < end_row) {
+    if (ptr->first_undef_row < start_row) {
+      if (writable)             /* writer skipped over a section of array */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+      undef_row = start_row;    /* but reader is allowed to read ahead */
+    } else {
+      undef_row = ptr->first_undef_row;
+    }
+    if (writable)
+      ptr->first_undef_row = end_row;
+    if (ptr->pre_zero) {
+      size_t bytesperrow = (size_t)ptr->blocksperrow * sizeof(JBLOCK);
+      undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
+      end_row -= ptr->cur_start_row;
+      while (undef_row < end_row) {
+        jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
+        undef_row++;
+      }
+    } else {
+      if (!writable)            /* reader looking at undefined data */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+    }
+  }
+  /* Flag the buffer dirty if caller will write in it */
+  if (writable)
+    ptr->dirty = TRUE;
+  /* Return address of proper part of the buffer */
+  return ptr->mem_buffer + (start_row - ptr->cur_start_row);
+}
+
+
+/*
+ * Release all objects belonging to a specified pool.
+ */
+
+METHODDEF(void)
+free_pool(j_common_ptr cinfo, int pool_id)
+{
+  my_mem_ptr mgr = (my_mem_ptr)cinfo->mem;
+  large_pool_ptr big, big_next;
+  small_pool_ptr tiny, tiny_next;
+  size_t nbytes;
+
+  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* reject out-of-range IDs */
+
+#ifdef MEM_STATS
+  if (cinfo->err->trace_level > 1)
+    print_mem_stats(cinfo, pool_id); /* dump this pool's usage statistics */
+#endif
+
+  /* Freeing the IMAGE pool: close any open backing stores beforehand */
+  if (pool_id == JPOOL_IMAGE) {
+    jvirt_sarray_ptr sarr;
+    jvirt_barray_ptr barr;
+
+    for (sarr = mgr->virt_sarray_list; sarr != NULL; sarr = sarr->next) {
+      if (!sarr->b_s_open)
+        continue;               /* no backing store is open for this one */
+      sarr->b_s_open = FALSE;   /* cleared first so an error can't re-close */
+      (*sarr->b_s_info.close_backing_store) (cinfo, &sarr->b_s_info);
+    }
+    mgr->virt_sarray_list = NULL;
+    for (barr = mgr->virt_barray_list; barr != NULL; barr = barr->next) {
+      if (!barr->b_s_open)
+        continue;               /* no backing store is open for this one */
+      barr->b_s_open = FALSE;   /* cleared first so an error can't re-close */
+      (*barr->b_s_info.close_backing_store) (cinfo, &barr->b_s_info);
+    }
+    mgr->virt_barray_list = NULL;
+  }
+
+  /* Release large objects.  The list head is detached before the walk, and
+   * each chunk's successor is saved before the chunk itself is handed back.
+   */
+  big = mgr->large_list[pool_id];
+  mgr->large_list[pool_id] = NULL;
+
+  for (; big != NULL; big = big_next) {
+    big_next = big->next;
+    nbytes = big->bytes_used + big->bytes_left +
+             sizeof(large_pool_hdr) + ALIGN_SIZE - 1;
+    jpeg_free_large(cinfo, (void *)big, nbytes);
+    mgr->total_space_allocated -= nbytes;
+  }
+
+  /* Release small objects the same way; the size passed to the allocator
+   * mirrors the large-object computation with the small header size. */
+  tiny = mgr->small_list[pool_id];
+  mgr->small_list[pool_id] = NULL;
+
+  for (; tiny != NULL; tiny = tiny_next) {
+    tiny_next = tiny->next;
+    nbytes = tiny->bytes_used + tiny->bytes_left +
+             sizeof(small_pool_hdr) + ALIGN_SIZE - 1;
+    jpeg_free_small(cinfo, (void *)tiny, nbytes);
+    mgr->total_space_allocated -= nbytes;
+  }
+}
+
+
+/*
+ * Close up shop entirely.
+ * Note that this cannot be called unless cinfo->mem is non-NULL.
+ */
+
+METHODDEF(void)
+self_destruct(j_common_ptr cinfo)
+{
+  int pool_id = JPOOL_NUMPOOLS;
+
+  /* Tear down every pool, highest ID first (free_pool also closes any
+   * backing store).  Going in reverse order might help avoid
+   * fragmentation with some malloc implementations.
+   */
+  while (--pool_id >= JPOOL_PERMANENT) {
+    free_pool(cinfo, pool_id);
+  }
+
+  /* The memory manager's own control block is released last. */
+  jpeg_free_small(cinfo, (void *)cinfo->mem, sizeof(my_memory_mgr));
+  cinfo->mem = NULL;            /* guards against a second invocation */
+
+  jpeg_mem_term(cinfo);         /* system-dependent cleanup */
+}
+
+
+/*
+ * Memory manager initialization.
+ * When this is called, only the error manager pointer is valid in cinfo!
+ */
+
+GLOBAL(void)
+jinit_memory_mgr(j_common_ptr cinfo)
+{
+ my_mem_ptr mem;
+ long max_to_use;
+ int pool;
+ size_t test_mac;
+
+ cinfo->mem = NULL; /* for safety if init fails */
+
+ /* Check for configuration errors.
+ * ALIGN_SIZE should be a power of 2; otherwise, it probably
+ * doesn't reflect any real hardware alignment requirement.
+ * The test is a little tricky: for X>0, X and X-1 have no one-bits
+ * in common if and only if X is a power of 2, ie has only one one-bit.
+ * Some compilers may give an "unreachable code" warning here; ignore it.
+ */
+ if ((ALIGN_SIZE & (ALIGN_SIZE - 1)) != 0)
+ ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
+ /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
+ * a multiple of ALIGN_SIZE.
+ * Again, an "unreachable code" warning may be ignored here.
+ * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
+ */
+ test_mac = (size_t)MAX_ALLOC_CHUNK;
+ if ((long)test_mac != MAX_ALLOC_CHUNK ||
+ (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
+ ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
+
+ max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
+
+ /* Attempt to allocate memory manager's control block */
+ mem = (my_mem_ptr)jpeg_get_small(cinfo, sizeof(my_memory_mgr));
+
+ if (mem == NULL) {
+ jpeg_mem_term(cinfo); /* system-dependent cleanup */
+ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 0);
+ }
+
+ /* OK, fill in the method pointers */
+ mem->pub.alloc_small = alloc_small;
+ mem->pub.alloc_large = alloc_large;
+ mem->pub.alloc_sarray = alloc_sarray;
+ mem->pub.alloc_barray = alloc_barray;
+ mem->pub.request_virt_sarray = request_virt_sarray;
+ mem->pub.request_virt_barray = request_virt_barray;
+ mem->pub.realize_virt_arrays = realize_virt_arrays;
+ mem->pub.access_virt_sarray = access_virt_sarray;
+ mem->pub.access_virt_barray = access_virt_barray;
+ mem->pub.free_pool = free_pool;
+ mem->pub.self_destruct = self_destruct;
+
+ /* Make MAX_ALLOC_CHUNK accessible to other modules */
+ mem->pub.max_alloc_chunk = MAX_ALLOC_CHUNK;
+
+ /* Initialize working state */
+ mem->pub.max_memory_to_use = max_to_use;
+
+ for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
+ mem->small_list[pool] = NULL;
+ mem->large_list[pool] = NULL;
+ }
+ mem->virt_sarray_list = NULL;
+ mem->virt_barray_list = NULL;
+
+ mem->total_space_allocated = sizeof(my_memory_mgr); /* the control block itself is counted */
+
+ /* Declare ourselves open for business */
+ cinfo->mem = &mem->pub;
+
+ /* Check for an environment variable JPEGMEM; if found, override the
+ * default max_memory setting from jpeg_mem_init. Note that the
+ * surrounding application may again override this value.
+ * If your system doesn't support getenv(), define NO_GETENV to disable
+ * this feature.
+ */
+#ifndef NO_GETENV
+ {
+ char memenv[30] = { 0 };
+
+ if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
+ char ch = 'x'; /* sentinel: stays 'x' when no suffix character is scanned */
+
+#ifdef _MSC_VER
+ if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
+#else
+ if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
+#endif
+ if (ch == 'm' || ch == 'M')
+ max_to_use *= 1000L; /* 'm'/'M' suffix: scale by a further factor of 1000 */
+ mem->pub.max_memory_to_use = max_to_use * 1000L; /* JPEGMEM counts units of 1000 bytes; NOTE(review): neither multiplication is checked for overflow */
+ }
+ }
+ }
+#endif
+
+}
diff --git a/media/libjpeg/jmemnobs.c b/media/libjpeg/jmemnobs.c
new file mode 100644
index 0000000000..cd6571ba1c
--- /dev/null
+++ b/media/libjpeg/jmemnobs.c
@@ -0,0 +1,110 @@
+/*
+ * jmemnobs.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1992-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017-2018, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides a really simple implementation of the system-
+ * dependent portion of the JPEG memory manager. This implementation
+ * assumes that no backing-store files are needed: all required space
+ * can be obtained from malloc().
+ * This is very portable in the sense that it'll compile on almost anything,
+ * but you'd better have lots of main memory (or virtual memory) if you want
+ * to process big images.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jmemsys.h" /* import the system-dependent declarations */
+
+
+/*
+ * Memory allocation and freeing are controlled by the regular library
+ * routines malloc() and free().
+ */
+
+GLOBAL(void *)
+jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject)
+{
+  return malloc(sizeofobject);  /* void * needs no cast in C; NULL on failure */
+}
+
+GLOBAL(void)
+jpeg_free_small(j_common_ptr cinfo, void *object, size_t sizeofobject)
+{
+ free(object); /* cinfo and sizeofobject are unused here: free() needs neither */
+}
+
+
+/*
+ * "Large" objects are treated the same as "small" ones.
+ */
+
+GLOBAL(void *)
+jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject)
+{
+  return malloc(sizeofobject);  /* void * needs no cast in C; NULL on failure */
+}
+
+GLOBAL(void)
+jpeg_free_large(j_common_ptr cinfo, void *object, size_t sizeofobject)
+{
+ free(object); /* cinfo and sizeofobject are unused here: free() needs neither */
+}
+
+
+/*
+ * This routine computes the total memory space available for allocation.
+ */
+
+GLOBAL(size_t)
+jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+                   size_t max_bytes_needed, size_t already_allocated)
+{
+  size_t limit;
+
+  /* A zero limit means "unlimited": grant the caller's maximum request. */
+  if (!cinfo->mem->max_memory_to_use)
+    return max_bytes_needed;
+
+  /* Otherwise report the unused part of the limit, never going below zero. */
+  limit = (size_t)cinfo->mem->max_memory_to_use;
+  return (limit > already_allocated) ? limit - already_allocated : 0;
+}
+
+
+/*
+ * Backing store (temporary file) management.
+ * There is none here, so just error out. NOTE(review): presumably reachable
+ * when a nonzero max_memory_to_use makes jpeg_mem_available come up short.
+ */
+
+GLOBAL(void)
+jpeg_open_backing_store(j_common_ptr cinfo, backing_store_ptr info,
+ long total_bytes_needed)
+{
+ ERREXIT(cinfo, JERR_NO_BACKING_STORE);
+}
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required. Here, there isn't any.
+ */
+
+GLOBAL(long)
+jpeg_mem_init(j_common_ptr cinfo)
+{
+ return 0; /* default max_memory_to_use of 0; jpeg_mem_available treats 0 as "no limit" */
+}
+
+GLOBAL(void)
+jpeg_mem_term(j_common_ptr cinfo)
+{
+ /* no work: jpeg_mem_init in this file sets up nothing that needs undoing */
+}
diff --git a/media/libjpeg/jmemsys.h b/media/libjpeg/jmemsys.h
new file mode 100644
index 0000000000..9229550afd
--- /dev/null
+++ b/media/libjpeg/jmemsys.h
@@ -0,0 +1,178 @@
+/*
+ * jmemsys.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1992-1997, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This include file defines the interface between the system-independent
+ * and system-dependent portions of the JPEG memory manager. No other
+ * modules need include it. (The system-independent portion is jmemmgr.c;
+ * there are several different versions of the system-dependent portion.)
+ *
+ * This file works as-is for the system-dependent memory managers supplied
+ * in the IJG distribution. You may need to modify it if you write a
+ * custom memory manager. If system-dependent changes are needed in
+ * this file, the best method is to #ifdef them based on a configuration
+ * symbol supplied in jconfig.h.
+ */
+
+
+/*
+ * These two functions are used to allocate and release small chunks of
+ * memory. (Typically the total amount requested through jpeg_get_small is
+ * no more than 20K or so; this will be requested in chunks of a few K each.)
+ * Behavior should be the same as for the standard library functions malloc
+ * and free; in particular, jpeg_get_small must return NULL on failure.
+ * On most systems, these ARE malloc and free. jpeg_free_small is passed the
+ * size of the object being freed, just in case it's needed.
+ */
+
+EXTERN(void *) jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
+
+/*
+ * These two functions are used to allocate and release large chunks of
+ * memory (up to the total free space designated by jpeg_mem_available).
+ * These are identical to the jpeg_get/free_small routines; but we keep them
+ * separate anyway, in case a different allocation strategy is desirable for
+ * large chunks.
+ */
+
+EXTERN(void *) jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
+
+/*
+ * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
+ * be requested in a single call to jpeg_get_large (and jpeg_get_small for that
+ * matter, but that case should never come into play). This macro was needed
+ * to model the 64Kb-segment-size limit of far addressing on 80x86 machines.
+ * On machines with flat address spaces, any large constant may be used.
+ *
+ * NB: jmemmgr.c expects that MAX_ALLOC_CHUNK will be representable as type
+ * size_t and will be a multiple of ALIGN_SIZE.
+ */
+
+#ifndef MAX_ALLOC_CHUNK /* may be overridden in jconfig.h */
+#define MAX_ALLOC_CHUNK 1000000000L /* 10^9 bytes (= 2^9 * 5^9) */
+#endif
+
+/*
+ * This routine computes the total space still available for allocation by
+ * jpeg_get_large. If more space than this is needed, backing store will be
+ * used. NOTE: any memory already allocated must not be counted.
+ *
+ * There is a minimum space requirement, corresponding to the minimum
+ * feasible buffer sizes; jmemmgr.c will request that much space even if
+ * jpeg_mem_available returns zero. The maximum space needed, enough to hold
+ * all working storage in memory, is also passed in case it is useful.
+ * Finally, the total space already allocated is passed. If no better
+ * method is available, cinfo->mem->max_memory_to_use - already_allocated
+ * is often a suitable calculation.
+ *
+ * It is OK for jpeg_mem_available to underestimate the space available
+ * (that'll just lead to more backing-store access than is really necessary).
+ * However, an overestimate will lead to failure. Hence it's wise to subtract
+ * a slop factor from the true available space. 5% should be enough.
+ *
+ * On machines with lots of virtual memory, any large constant may be returned.
+ * Conversely, zero may be returned to always use the minimum amount of memory.
+ */
+
+EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+ size_t max_bytes_needed,
+ size_t already_allocated);
+
+
+/*
+ * This structure holds whatever state is needed to access a single
+ * backing-store object. The read/write/close method pointers are called
+ * by jmemmgr.c to manipulate the backing-store object; all other fields
+ * are private to the system-dependent backing store routines.
+ */
+
+#define TEMP_NAME_LENGTH 64 /* max length of a temporary file's name */
+
+
+#ifdef USE_MSDOS_MEMMGR /* DOS-specific junk */
+
+typedef unsigned short XMSH; /* type of extended-memory handles */
+typedef unsigned short EMSH; /* type of expanded-memory handles */
+
+typedef union {
+ short file_handle; /* DOS file handle if it's a temp file */
+ XMSH xms_handle; /* handle if it's a chunk of XMS */
+ EMSH ems_handle; /* handle if it's a chunk of EMS */
+} handle_union;
+
+#endif /* USE_MSDOS_MEMMGR */
+
+#ifdef USE_MAC_MEMMGR /* Mac-specific junk */
+#include <Files.h>
+#endif /* USE_MAC_MEMMGR */
+
+
+typedef struct backing_store_struct *backing_store_ptr;
+
+typedef struct backing_store_struct {
+ /* Methods for reading/writing/closing this backing-store object; 'info' is the object being operated on */
+ void (*read_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
+ void *buffer_address, long file_offset,
+ long byte_count);
+ void (*write_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
+ void *buffer_address, long file_offset,
+ long byte_count);
+ void (*close_backing_store) (j_common_ptr cinfo, backing_store_ptr info);
+
+ /* Private fields for system-dependent backing-store management */
+#ifdef USE_MSDOS_MEMMGR
+ /* For the MS-DOS manager (jmemdos.c), we need: */
+ handle_union handle; /* reference to backing-store storage object */
+ char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
+#else
+#ifdef USE_MAC_MEMMGR
+ /* For the Mac manager (jmemmac.c), we need: */
+ short temp_file; /* file reference number to temp file */
+ FSSpec tempSpec; /* the FSSpec for the temp file */
+ char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
+#else
+ /* For a typical implementation with temp files, we need: */
+ FILE *temp_file; /* stdio reference to temp file */
+ char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */
+#endif /* USE_MAC_MEMMGR */
+#endif /* USE_MSDOS_MEMMGR */
+} backing_store_info;
+
+
+/*
+ * Initial opening of a backing-store object. This must fill in the
+ * read/write/close pointers in the object. The read/write routines
+ * may take an error exit if the specified maximum file size is exceeded.
+ * (If jpeg_mem_available always returns a large value, this routine can
+ * just take an error exit.)
+ */
+
+EXTERN(void) jpeg_open_backing_store(j_common_ptr cinfo,
+ backing_store_ptr info,
+ long total_bytes_needed);
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required. jpeg_mem_init will be called before anything is
+ * allocated (and, therefore, nothing in cinfo is of use except the error
+ * manager pointer). It should return a suitable default value for
+ * max_memory_to_use; this may subsequently be overridden by the surrounding
+ * application. (Note that max_memory_to_use is only important if
+ * jpeg_mem_available chooses to consult it ... no one else will.)
+ * jpeg_mem_term may assume that all requested memory has been freed and that
+ * all opened backing-store objects have been closed.
+ */
+
+EXTERN(long) jpeg_mem_init(j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term(j_common_ptr cinfo);
diff --git a/media/libjpeg/jmorecfg.h b/media/libjpeg/jmorecfg.h
new file mode 100644
index 0000000000..8cda8041b2
--- /dev/null
+++ b/media/libjpeg/jmorecfg.h
@@ -0,0 +1,373 @@
+/*
+ * jmorecfg.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains additional configuration options that customize the
+ * JPEG software for special applications or support machine-dependent
+ * optimizations. Most users will not need to touch this file.
+ */
+
+#include <stdint.h>
+
+/*
+ * Maximum number of components (color channels) allowed in JPEG image.
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+ * However, darn few applications need more than 4 channels (maybe 5 for CMYK +
+ * alpha mask). We recommend 10 as a reasonable compromise; use 4 if you are
+ * really short on memory. (Each allowed component costs a hundred or so
+ * bytes of storage, whether actually used in an image or not.)
+ */
+
+#define MAX_COMPONENTS 10 /* maximum number of image components */
+
+
+/*
+ * Basic data types.
+ * You may need to change these if you have a machine with unusual data
+ * type sizes; for example, "char" not 8 bits, "short" not 16 bits,
+ * or "long" not 32 bits. We don't care whether "int" is 16 or 32 bits,
+ * but it had better be at least 16.
+ */
+
+/* Representation of a single sample (pixel element value).
+ * We frequently allocate large arrays of these, so it's important to keep
+ * them small. But if you have memory to burn and access to char or short
+ * arrays is very slow on your hardware, you might want to change these.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+/* JSAMPLE should be the smallest type that will hold the values 0..255.
+ */
+
+typedef unsigned char JSAMPLE;
+#define GETJSAMPLE(value) ((int)(value))
+
+#define MAXJSAMPLE 255
+#define CENTERJSAMPLE 128
+
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+#if BITS_IN_JSAMPLE == 12
+/* JSAMPLE should be the smallest type that will hold the values 0..4095.
+ * On nearly all machines "short" will do nicely.
+ */
+
+typedef short JSAMPLE;
+#define GETJSAMPLE(value) ((int)(value))
+
+#define MAXJSAMPLE 4095
+#define CENTERJSAMPLE 2048
+
+#endif /* BITS_IN_JSAMPLE == 12 */
+
+
+/* Representation of a DCT frequency coefficient.
+ * This should be a signed value of at least 16 bits; "short" is usually OK.
+ * Again, we allocate large arrays of these, but you can change to int
+ * if you have memory to burn and "short" is really slow.
+ */
+
+typedef short JCOEF;
+
+
+/* Compressed datastreams are represented as arrays of JOCTET.
+ * These must be EXACTLY 8 bits wide, at least once they are written to
+ * external storage. Note that when using the stdio data source/destination
+ * managers, this is also the data type passed to fread/fwrite.
+ */
+
+typedef unsigned char JOCTET;
+#define GETJOCTET(value) (value)
+
+
+/* These typedefs are used for various table entries and so forth.
+ * They must be at least as wide as specified; but making them too big
+ * won't cost a huge amount of memory, so we don't provide special
+ * extraction code like we did for JSAMPLE. (In other words, these
+ * typedefs live at a different point on the speed/space tradeoff curve.)
+ */
+
+/* UINT8 must hold at least the values 0..255. */
+
+typedef uint8_t UINT8;
+
+/* UINT16 must hold at least the values 0..65535. */
+
+typedef uint16_t UINT16;
+
+/* INT16 must hold at least the values -32768..32767. */
+
+typedef int16_t INT16;
+
+/* INT32 must hold at least signed 32-bit values.
+ *
+ * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were
+ * sometimes 16-bit back then (MS-DOS), so INT32 was originally typedef'd to
+ * long (this copy uses int32_t). It also wasn't common (or at least as
+ * common) in 1994 for INT32 to be defined by platform headers. Since then,
+ * however, INT32 is defined in several other common places:
+ *
+ * Xmd.h (X11 header) typedefs INT32 to int on 64-bit platforms and long on
+ * 32-bit platforms (i.e always a 32-bit signed type.)
+ *
+ * basetsd.h (Win32 header) typedefs INT32 to int (always a 32-bit signed type
+ * on modern platforms.)
+ *
+ * qglobal.h (Qt header) typedefs INT32 to int (always a 32-bit signed type on
+ * modern platforms.)
+ *
+ * This is a recipe for conflict, since "long" and "int" aren't always
+ * compatible types. Since the definition of INT32 has technically been part
+ * of the libjpeg API for more than 20 years, we can't remove it, but we do not
+ * use it internally any longer. We instead define a separate type (JLONG)
+ * for internal use, which ensures that internal behavior will always be the
+ * same regardless of any external headers that may be included.
+ */
+
+typedef int32_t INT32;
+
+/* Datatype used for image dimensions. The JPEG standard only supports
+ * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
+ * "unsigned int" is sufficient on all machines. However, if you need to
+ * handle larger images and you don't mind deviating from the spec, you
+ * can change this datatype. (Note that changing this datatype will
+ * potentially require modifying the SIMD code. The x86-64 SIMD extensions,
+ * in particular, assume a 32-bit JDIMENSION.)
+ */
+
+typedef unsigned int JDIMENSION;
+
+#define JPEG_MAX_DIMENSION 65500L /* a tad under 64K to prevent overflows */
+
+
+/* These macros are used in all function definitions and extern declarations.
+ * You could modify them if you need to change function linkage conventions;
+ * in particular, you'll need to do that to make the library a Windows DLL.
+ * Another application is to make all functions global for use with debuggers
+ * or code profilers that require it.
+ */
+
+/* a function called through method pointers: */
+#define METHODDEF(type) static type
+/* a function used only in its module: */
+#define LOCAL(type) static type
+/* a function referenced thru EXTERNs: */
+#define GLOBAL(type) type
+/* a reference to a GLOBAL function: */
+#define EXTERN(type) extern type
+
+
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters. libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
+ */
+
+#define JMETHOD(type, methodname, arglist) type (*methodname) arglist
+
+
+/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
+ * but again, some software relies on this macro.
+ */
+
+#undef FAR
+#define FAR
+
+
+/*
+ * On a few systems, type boolean and/or its values FALSE, TRUE may appear
+ * in standard header files. Or you may have conflicts with application-
+ * specific header files that you want to include together with these files.
+ * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
+ */
+
+#ifndef HAVE_BOOLEAN
+typedef int boolean;
+#endif
+#ifndef FALSE /* in case these macros already exist */
+#define FALSE 0 /* values of boolean */
+#endif
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+
+/*
+ * The remaining options affect code selection within the JPEG library,
+ * but they don't need to be visible to most applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS or JPEG_INTERNAL_OPTIONS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+#define JPEG_INTERNAL_OPTIONS
+#endif
+
+#ifdef JPEG_INTERNAL_OPTIONS
+
+
+/*
+ * These defines indicate whether to include various optional functions.
+ * Undefining some of these symbols will produce a smaller but less capable
+ * library. Note that you can leave certain source files out of the
+ * compilation/linking process if you've #undef'd the corresponding symbols.
+ * (You may HAVE to do that if your compiler doesn't like null source files.)
+ */
+
+/* Capability options common to encoder and decoder: */
+
+#define DCT_ISLOW_SUPPORTED /* accurate integer method */
+#define DCT_IFAST_SUPPORTED /* less accurate int method [legacy feature] */
+#define DCT_FLOAT_SUPPORTED /* floating-point method [legacy feature] */
+
+/* Encoder capability options: */
+
+#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/
+#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */
+/* Note: if you selected 12-bit data precision, it is dangerous to turn off
+ * ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit
+ * precision, so jchuff.c normally uses entropy optimization to compute
+ * usable tables for higher precision. If you don't want to do optimization,
+ * you'll have to supply different default Huffman tables.
+ * The exact same statements apply for progressive JPEG: the default tables
+ * don't work for progressive mode. (This may get fixed, however.)
+ */
+#define INPUT_SMOOTHING_SUPPORTED /* Input image smoothing option? */
+
+/* Decoder capability options: */
+
+#define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/
+#define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */
+#define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */
+#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */
+#undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */
+#define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */
+#define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED /* 2-pass color quantization? */
+
+/* more capability options later, no doubt */
+
+
+/*
+ * The RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros are a vestigial
+ * feature of libjpeg. The idea was that, if an application developer needed
+ * to compress from/decompress to a BGR/BGRX/RGBX/XBGR/XRGB buffer, they could
+ * change these macros, rebuild libjpeg, and link their application statically
+ * with it. In reality, few people ever did this, because there were some
+ * severe restrictions involved (cjpeg and djpeg no longer worked properly,
+ * compressing/decompressing RGB JPEGs no longer worked properly, and the color
+ * quantizer wouldn't work with pixel sizes other than 3.) Furthermore, since
+ * all of the O/S-supplied versions of libjpeg were built with the default
+ * values of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications
+ * have come to regard these values as immutable.
+ *
+ * The libjpeg-turbo colorspace extensions provide a much cleaner way of
+ * compressing from/decompressing to buffers with arbitrary component orders
+ * and pixel sizes. Thus, we do not support changing the values of RGB_RED,
+ * RGB_GREEN, RGB_BLUE, or RGB_PIXELSIZE. In addition to the restrictions
+ * listed above, changing these values will also break the SIMD extensions and
+ * the regression tests.
+ */
+
+#define RGB_RED 0 /* Offset of Red in an RGB scanline element */
+#define RGB_GREEN 1 /* Offset of Green */
+#define RGB_BLUE 2 /* Offset of Blue */
+#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */
+
+#define JPEG_NUMCS 17
+
+#define EXT_RGB_RED 0
+#define EXT_RGB_GREEN 1
+#define EXT_RGB_BLUE 2
+#define EXT_RGB_PIXELSIZE 3
+
+#define EXT_RGBX_RED 0
+#define EXT_RGBX_GREEN 1
+#define EXT_RGBX_BLUE 2
+#define EXT_RGBX_PIXELSIZE 4
+
+#define EXT_BGR_RED 2
+#define EXT_BGR_GREEN 1
+#define EXT_BGR_BLUE 0
+#define EXT_BGR_PIXELSIZE 3
+
+#define EXT_BGRX_RED 2
+#define EXT_BGRX_GREEN 1
+#define EXT_BGRX_BLUE 0
+#define EXT_BGRX_PIXELSIZE 4
+
+#define EXT_XBGR_RED 3
+#define EXT_XBGR_GREEN 2
+#define EXT_XBGR_BLUE 1
+#define EXT_XBGR_PIXELSIZE 4
+
+#define EXT_XRGB_RED 1
+#define EXT_XRGB_GREEN 2
+#define EXT_XRGB_BLUE 3
+#define EXT_XRGB_PIXELSIZE 4
+
+static const int rgb_red[JPEG_NUMCS] = { /* red-sample offset per table index; -1 presumably marks entries with no RGB layout; index order presumably follows the colorspace enum (TODO confirm) */
+ -1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
+ EXT_BGR_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED,
+ EXT_RGBX_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED,
+ -1
+};
+
+static const int rgb_green[JPEG_NUMCS] = { /* green-sample offset, same indexing as rgb_red */
+ -1, -1, RGB_GREEN, -1, -1, -1, EXT_RGB_GREEN, EXT_RGBX_GREEN,
+ EXT_BGR_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN,
+ EXT_RGBX_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN,
+ -1
+};
+
+static const int rgb_blue[JPEG_NUMCS] = { /* blue-sample offset, same indexing as rgb_red */
+ -1, -1, RGB_BLUE, -1, -1, -1, EXT_RGB_BLUE, EXT_RGBX_BLUE,
+ EXT_BGR_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE,
+ EXT_RGBX_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE,
+ -1
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = { /* samples per pixel (3 or 4), same indexing as rgb_red */
+ -1, -1, RGB_PIXELSIZE, -1, -1, -1, EXT_RGB_PIXELSIZE, EXT_RGBX_PIXELSIZE,
+ EXT_BGR_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE,
+ EXT_RGBX_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE,
+ -1
+};
+
+/* Definitions for speed-related optimizations. */
+
+/* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
+ * two 16-bit shorts is faster than multiplying two ints. Define MULTIPLIER
+ * as short on such a machine. MULTIPLIER must be at least 16 bits wide.
+ */
+
+#ifndef MULTIPLIER
+#ifndef WITH_SIMD
+#define MULTIPLIER int /* type for fastest integer multiply */
+#else
+#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
+#endif
+#endif
+
+
+/* FAST_FLOAT should be either float or double, whichever is done faster
+ * by your compiler. (Note that this type is only used in the floating point
+ * DCT routines, so it only matters if you've defined DCT_FLOAT_SUPPORTED.)
+ */
+
+#ifndef FAST_FLOAT
+#define FAST_FLOAT float
+#endif
+
+#endif /* JPEG_INTERNAL_OPTIONS */
diff --git a/media/libjpeg/jpeg_nbits_table.h b/media/libjpeg/jpeg_nbits_table.h
new file mode 100644
index 0000000000..fcf73878c3
--- /dev/null
+++ b/media/libjpeg/jpeg_nbits_table.h
@@ -0,0 +1,4098 @@
+static const unsigned char jpeg_nbits_table[65536] = {
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+};
diff --git a/media/libjpeg/jpegcomp.h b/media/libjpeg/jpegcomp.h
new file mode 100644
index 0000000000..c4834ac0df
--- /dev/null
+++ b/media/libjpeg/jpegcomp.h
@@ -0,0 +1,32 @@
+/*
+ * jpegcomp.h
+ *
+ * Copyright (C) 2010, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * JPEG compatibility macros
+ * These declarations are considered internal to the JPEG library; most
+ * applications using the library shouldn't need to include this file.
+ */
+
+#if JPEG_LIB_VERSION >= 70 /* separate h/v scaled-size fields, jpeg_* dims */
+#define _DCT_scaled_size DCT_h_scaled_size /* unsuffixed name -> horizontal */
+#define _DCT_h_scaled_size DCT_h_scaled_size
+#define _DCT_v_scaled_size DCT_v_scaled_size
+#define _min_DCT_scaled_size min_DCT_h_scaled_size /* unsuffixed -> horizontal */
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#define JERR_ARITH_NOTIMPL JERR_NOT_COMPILED /* remapped in this branch only */
+#else /* JPEG_LIB_VERSION < 70, or undefined (an undefined name is 0 in #if) */
+#define _DCT_scaled_size DCT_scaled_size
+#define _DCT_h_scaled_size DCT_scaled_size /* h and v share one field here */
+#define _DCT_v_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width
+#define _jpeg_height image_height
+#endif /* JPEG_LIB_VERSION >= 70 */
diff --git a/media/libjpeg/jpegint.h b/media/libjpeg/jpegint.h
new file mode 100644
index 0000000000..6af9e2a179
--- /dev/null
+++ b/media/libjpeg/jpegint.h
@@ -0,0 +1,375 @@
+/*
+ * jpegint.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015-2016, 2019, 2021, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2021, Alex Richardson.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides common declarations for the various JPEG modules.
+ * These declarations are considered internal to the JPEG library; most
+ * applications using the library shouldn't need to include this file.
+ */
+
+
+/* Declarations for both compression & decompression */
+
+typedef enum { /* Operating modes for buffer controllers */
+ JBUF_PASS_THRU, /* Plain stripwise operation */
+ /* Remaining modes require a full-image buffer to have been created */
+ JBUF_SAVE_SOURCE, /* Run source subobject only, save output */
+ JBUF_CRANK_DEST, /* Run dest subobject only, using saved data */
+ JBUF_SAVE_AND_PASS /* Run both subobjects, save output */
+} J_BUF_MODE;
+
+/* Values of global_state field (jdapi*.c have some dependencies on ordering!) */
+#define CSTATE_START 100 /* after create_compress */
+#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */
+#define DSTATE_START 200 /* after create_decompress */
+#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */
+#define DSTATE_READY 202 /* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/
+#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */
+#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */
+
+
+/* JLONG must hold at least signed 32-bit values. */
+typedef long JLONG;
+
+/* JUINTPTR must hold pointer values. */
+#ifdef __UINTPTR_TYPE__
+/*
+ * __UINTPTR_TYPE__ is GNU-specific and available in GCC 4.6+ and Clang 3.0+.
+ * Fortunately, that is sufficient to support the few architectures for which
+ * sizeof(void *) != sizeof(size_t). The only other options would require C99
+ * or Clang-specific builtins.
+ */
+typedef __UINTPTR_TYPE__ JUINTPTR;
+#else /* no __UINTPTR_TYPE__: rely on size_t being able to hold a pointer */
+typedef size_t JUINTPTR;
+#endif /* __UINTPTR_TYPE__ */
+
+/*
+ * Left shift macro that handles a negative operand without causing any
+ * sanitizer warnings
+ */
+
+#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
+
+
+/* Declarations for compression modules */
+
+/* Master control module */
+struct jpeg_comp_master {
+ void (*prepare_for_pass) (j_compress_ptr cinfo);
+ void (*pass_startup) (j_compress_ptr cinfo); /* see call_pass_startup */
+ void (*finish_pass) (j_compress_ptr cinfo);
+
+ /* State variables made visible to other modules */
+ boolean call_pass_startup; /* True if pass_startup must be called */
+ boolean is_last_pass; /* True during last pass */
+};
+
+/* Main buffer control (downsampled-data buffer) */
+struct jpeg_c_main_controller {
+ void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+ void (*process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail);
+};
+
+/* Compression preprocessing (downsampling input buffer control) */
+struct jpeg_c_prep_controller {
+ void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+ void (*pre_process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf,
+ JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail);
+};
+
+/* Coefficient buffer control */
+struct jpeg_c_coef_controller {
+ void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+ boolean (*compress_data) (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+};
+
+/* Colorspace conversion */
+struct jpeg_color_converter {
+ void (*start_pass) (j_compress_ptr cinfo);
+ void (*color_convert) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows);
+};
+
+/* Downsampling */
+struct jpeg_downsampler {
+ void (*start_pass) (j_compress_ptr cinfo);
+ void (*downsample) (j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+ JDIMENSION out_row_group_index);
+
+ boolean need_context_rows; /* TRUE if need rows above & below */
+};
+
+/* Forward DCT (also controls coefficient quantization) */
+struct jpeg_forward_dct {
+ void (*start_pass) (j_compress_ptr cinfo);
+ /* perhaps this should be an array??? (jpeg_inverse_dct has one per component) */
+ void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks);
+};
+
+/* Entropy encoding */
+struct jpeg_entropy_encoder {
+ void (*start_pass) (j_compress_ptr cinfo, boolean gather_statistics);
+ boolean (*encode_mcu) (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+ void (*finish_pass) (j_compress_ptr cinfo);
+};
+
+/* Marker writing */
+struct jpeg_marker_writer {
+ void (*write_file_header) (j_compress_ptr cinfo);
+ void (*write_frame_header) (j_compress_ptr cinfo);
+ void (*write_scan_header) (j_compress_ptr cinfo);
+ void (*write_file_trailer) (j_compress_ptr cinfo);
+ void (*write_tables_only) (j_compress_ptr cinfo);
+ /* These routines are exported to allow insertion of extra markers */
+ /* Probably only COM and APPn markers should be written this way */
+ void (*write_marker_header) (j_compress_ptr cinfo, int marker,
+ unsigned int datalen);
+ void (*write_marker_byte) (j_compress_ptr cinfo, int val);
+};
+
+
+/* Declarations for decompression modules */
+
+/* Master control module */
+struct jpeg_decomp_master {
+ void (*prepare_for_output_pass) (j_decompress_ptr cinfo);
+ void (*finish_output_pass) (j_decompress_ptr cinfo);
+
+ /* State variables made visible to other modules */
+ boolean is_dummy_pass; /* True during 1st pass for 2-pass quant */
+
+ /* Partial decompression variables */
+ JDIMENSION first_iMCU_col;
+ JDIMENSION last_iMCU_col;
+ JDIMENSION first_MCU_col[MAX_COMPONENTS];
+ JDIMENSION last_MCU_col[MAX_COMPONENTS];
+ boolean jinit_upsampler_no_alloc;
+
+ /* Last iMCU row that was successfully decoded */
+ JDIMENSION last_good_iMCU_row;
+};
+
+/* Input control module */
+struct jpeg_input_controller {
+ int (*consume_input) (j_decompress_ptr cinfo);
+ void (*reset_input_controller) (j_decompress_ptr cinfo);
+ void (*start_input_pass) (j_decompress_ptr cinfo);
+ void (*finish_input_pass) (j_decompress_ptr cinfo);
+
+ /* State variables made visible to other modules */
+ boolean has_multiple_scans; /* True if file has multiple scans */
+ boolean eoi_reached; /* True when EOI has been consumed */
+};
+
+/* Main buffer control (downsampled-data buffer) */
+struct jpeg_d_main_controller {
+ void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode);
+ void (*process_data) (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+};
+
+/* Coefficient buffer control */
+struct jpeg_d_coef_controller {
+ void (*start_input_pass) (j_decompress_ptr cinfo);
+ int (*consume_data) (j_decompress_ptr cinfo);
+ void (*start_output_pass) (j_decompress_ptr cinfo);
+ int (*decompress_data) (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+ /* Pointer to array of coefficient virtual arrays, or NULL if none */
+ jvirt_barray_ptr *coef_arrays;
+};
+
+/* Decompression postprocessing (color quantization buffer control) */
+struct jpeg_d_post_controller {
+ void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode);
+ void (*post_process_data) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+};
+
+/* Marker reading & parsing */
+struct jpeg_marker_reader {
+ void (*reset_marker_reader) (j_decompress_ptr cinfo);
+ /* Read markers until SOS or EOI.
+ * Returns same codes as are defined for jpeg_consume_input:
+ * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ */
+ int (*read_markers) (j_decompress_ptr cinfo);
+ /* Read a restart marker --- exported for use by entropy decoder only */
+ jpeg_marker_parser_method read_restart_marker;
+
+ /* State of marker reader --- nominally internal, but applications
+ * supplying COM or APPn handlers might like to know the state.
+ */
+ boolean saw_SOI; /* found SOI? */
+ boolean saw_SOF; /* found SOF? */
+ int next_restart_num; /* next restart number expected (0-7) */
+ unsigned int discarded_bytes; /* # of bytes skipped looking for a marker */
+};
+
+/* Entropy decoding */
+struct jpeg_entropy_decoder {
+ void (*start_pass) (j_decompress_ptr cinfo);
+ boolean (*decode_mcu) (j_decompress_ptr cinfo, JBLOCKROW *MCU_data);
+
+ /* This is here to share code between baseline and progressive decoders; */
+ /* other modules probably should not use it */
+ boolean insufficient_data; /* set TRUE after emitting warning */
+};
+
+/* Inverse DCT (also performs dequantization) */
+typedef void (*inverse_DCT_method_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+struct jpeg_inverse_dct {
+ void (*start_pass) (j_decompress_ptr cinfo);
+ /* It is useful to allow each component to have a separate IDCT method. */
+ inverse_DCT_method_ptr inverse_DCT[MAX_COMPONENTS];
+};
+
+/* Upsampling (note that upsampler must also call color converter) */
+struct jpeg_upsampler {
+ void (*start_pass) (j_decompress_ptr cinfo);
+ void (*upsample) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+
+ boolean need_context_rows; /* TRUE if need rows above & below */
+};
+
+/* Colorspace conversion */
+struct jpeg_color_deconverter {
+ void (*start_pass) (j_decompress_ptr cinfo);
+ void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows);
+};
+
+/* Color quantization or color precision reduction */
+struct jpeg_color_quantizer {
+ void (*start_pass) (j_decompress_ptr cinfo, boolean is_pre_scan);
+ void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows);
+ void (*finish_pass) (j_decompress_ptr cinfo);
+ void (*new_color_map) (j_decompress_ptr cinfo);
+};
+
+
+/* Miscellaneous useful macros */
+
+#undef MAX /* replace any definition already in effect */
+#define MAX(a, b) ((a) > (b) ? (a) : (b)) /* the chosen argument is evaluated twice */
+#undef MIN /* replace any definition already in effect */
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) /* the chosen argument is evaluated twice */
+
+
+/* We assume that right shift corresponds to signed division by 2 with
+ * rounding towards minus infinity. This is correct for typical "arithmetic
+ * shift" instructions that shift in copies of the sign bit. But some
+ * C compilers implement >> with an unsigned shift. For these machines you
+ * must define RIGHT_SHIFT_IS_UNSIGNED.
+ * RIGHT_SHIFT provides a proper signed right shift of a JLONG quantity.
+ * It is only applied with constant shift counts. SHIFT_TEMPS must be
+ * included in the variables of any routine using RIGHT_SHIFT.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED /* >> zero-fills: OR sign bits in from bit 32-shft up */
+#define SHIFT_TEMPS JLONG shift_temp;
+#define RIGHT_SHIFT(x, shft) \
+ ((shift_temp = (x)) < 0 ? \
+ (shift_temp >> (shft)) | LEFT_SHIFT(~((JLONG)0), 32 - (shft)) : \
+ (shift_temp >> (shft)))
+#else /* >> already propagates the sign bit; no temporary needed */
+#define SHIFT_TEMPS
+#define RIGHT_SHIFT(x, shft) ((x) >> (shft))
+#endif /* RIGHT_SHIFT_IS_UNSIGNED */
+
+
+/* Compression module initialization routines */
+EXTERN(void) jinit_compress_master(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
+ boolean transcode_only);
+EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
+/* Decompression module initialization routines */
+EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
+/* Memory manager initialization */
+EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
+
+/* Utility routines in jutils.c */
+EXTERN(long) jdiv_round_up(long a, long b);
+EXTERN(long) jround_up(long a, long b);
+EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row,
+ int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks);
+EXTERN(void) jzero_far(void *target, size_t bytestozero);
+/* Constant tables in jutils.c */
+#if 0 /* This table is not actually needed in v6a */
+extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
+#endif
+extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
+
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const JLONG jpeg_aritab[];
diff --git a/media/libjpeg/jpeglib.h b/media/libjpeg/jpeglib.h
new file mode 100644
index 0000000000..d7664f0630
--- /dev/null
+++ b/media/libjpeg/jpeglib.h
@@ -0,0 +1,1132 @@
+/*
+ * jpeglib.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file defines the application interface for the JPEG library.
+ * Most applications using the library need only include this file,
+ * and perhaps jerror.h if they want to know the exact error codes.
+ */
+
+#ifndef JPEGLIB_H
+#define JPEGLIB_H
+
+/*
+ * First we include the configuration files that record how this
+ * installation of the JPEG library is set up. jconfig.h can be
+ * generated automatically for many systems. jmorecfg.h contains
+ * manual configuration options that most people need not worry about.
+ */
+
+#ifndef JCONFIG_INCLUDED /* in case jinclude.h already did */
+#include "jconfig.h" /* widely used configuration options */
+#endif
+#include "jmorecfg.h" /* seldom changed options */
+
+
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
+
+/* Various constants determining the sizes of things.
+ * All of these are specified by the JPEG standard, so don't change them
+ * if you want to be compatible.
+ */
+
+#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2 64 /* DCTSIZE squared (8*8); # of elements in a block */
+#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */
+/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
+ * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
+ * If you happen to run across such a file, you can raise D_MAX_BLOCKS_IN_MCU
+ * to handle it. We even let you do this from the jconfig.h file. However,
+ * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
+ * sometimes emits noncompliant files doesn't mean you should too.
+ */
+#define C_MAX_BLOCKS_IN_MCU 10 /* compressor's limit on blocks per MCU; sizes the compressor's MCU_membership[] */
+#ifndef D_MAX_BLOCKS_IN_MCU /* an earlier definition (e.g. from jconfig.h) takes precedence */
+#define D_MAX_BLOCKS_IN_MCU 10 /* decompressor's limit on blocks per MCU; sizes the decompressor's MCU_membership[] */
+#endif
+
+
+/* Data structures for images (arrays of samples and of DCT coefficients).
+ * JSAMPLE and JCOEF are not defined in this file; they come from the configuration headers included above. */
+
+typedef JSAMPLE *JSAMPROW; /* ptr to one image row of pixel samples. */
+typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
+
+typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of DCTSIZE2 coefficients */
+typedef JBLOCK *JBLOCKROW; /* pointer to one row of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */
+
+typedef JCOEF *JCOEFPTR; /* ptr to a single coefficient; useful in a couple of places */
+
+
+/* Types for JPEG compression parameters and working tables. */
+
+
+/* DCT coefficient quantization tables (one step size per coefficient of a block). */
+
+typedef struct {
+ /* This array gives the coefficient quantizers in natural array order
+ * (not the zigzag order in which they are stored in a JPEG DQT marker).
+ * CAUTION: IJG versions prior to v6a kept this array in zigzag order.
+ */
+ UINT16 quantval[DCTSIZE2]; /* quantization step for each coefficient */
+ /* This field is used only during compression. It's initialized FALSE when
+ * the table is created, and set TRUE when it's been output to the file.
+ * You could suppress output of a table by setting this to TRUE.
+ * (See jpeg_suppress_tables for an example.)
+ */
+ boolean sent_table; /* TRUE when table has been output */
+} JQUANT_TBL;
+
+
+/* Huffman coding tables (code-length counts plus the symbol list). */
+
+typedef struct {
+ /* These two fields directly represent the contents of a JPEG DHT marker */
+ UINT8 bits[17]; /* bits[k] = # of symbols with codes of */
+ /* length k bits (k = 1..16); bits[0] is unused */
+ UINT8 huffval[256]; /* The symbols, in order of incr code length */
+ /* This field is used only during compression. It's initialized FALSE when
+ * the table is created, and set TRUE when it's been output to the file.
+ * You could suppress output of a table by setting this to TRUE.
+ * (See jpeg_suppress_tables for an example.)
+ */
+ boolean sent_table; /* TRUE when table has been output */
+} JHUFF_TBL;
+
+
+/* Basic info about one component (color channel). */
+
+typedef struct {
+ /* These values are fixed over the whole image. */
+ /* For compression, they must be supplied by parameter setup; */
+ /* for decompression, they are read from the SOF marker. */
+ int component_id; /* identifier for this component (0..255) */
+ int component_index; /* its index in SOF or cinfo->comp_info[] */
+ int h_samp_factor; /* horizontal sampling factor (1..4) */
+ int v_samp_factor; /* vertical sampling factor (1..4) */
+ int quant_tbl_no; /* quantization table selector (0..3) */
+ /* These values may vary between scans. */
+ /* For compression, they must be supplied by parameter setup; */
+ /* for decompression, they are read from the SOS marker. */
+ /* The decompressor output side may not use these variables. */
+ int dc_tbl_no; /* DC entropy table selector (0..3) */
+ int ac_tbl_no; /* AC entropy table selector (0..3) */
+
+ /* Remaining fields should be treated as private by applications. */
+
+ /* These values are computed during compression or decompression startup: */
+ /* Component's size in DCT blocks.
+ * Any dummy blocks added to complete an MCU are not counted; therefore
+ * these values do not depend on whether a scan is interleaved or not.
+ */
+ JDIMENSION width_in_blocks; /* component width, in DCT blocks */
+ JDIMENSION height_in_blocks; /* component height, in DCT blocks */
+ /* Size of a DCT block in samples. Always DCTSIZE for compression.
+ * For decompression this is the size of the output from one DCT block,
+ * reflecting any scaling we choose to apply during the IDCT step.
+ * Values from 1 to 16 are supported.
+ * Note that different components may receive different IDCT scalings.
+ */
+#if JPEG_LIB_VERSION >= 70
+ int DCT_h_scaled_size; /* horizontal size (v7+ layout has separate h and v fields) */
+ int DCT_v_scaled_size; /* vertical size */
+#else
+ int DCT_scaled_size; /* single size used for both directions (pre-v7 layout) */
+#endif
+ /* The downsampled dimensions are the component's actual, unpadded number
+ * of samples at the main buffer (preprocessing/compression interface), thus
+ * downsampled_width = ceil(image_width * Hi/Hmax)
+ * and similarly for height. For decompression, IDCT scaling is included, so
+ * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
+ */
+ JDIMENSION downsampled_width; /* actual width in samples */
+ JDIMENSION downsampled_height; /* actual height in samples */
+ /* This flag is used only for decompression. In cases where some of the
+ * components will be ignored (eg grayscale output from YCbCr image),
+ * we can skip most computations for the unused components.
+ */
+ boolean component_needed; /* do we need the value of this component? */
+
+ /* These values are computed before starting a scan of the component. */
+ /* The decompressor output side may not use these variables. */
+ int MCU_width; /* number of blocks per MCU, horizontally */
+ int MCU_height; /* number of blocks per MCU, vertically */
+ int MCU_blocks; /* MCU_width * MCU_height */
+ int MCU_sample_width; /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
+ int last_col_width; /* # of non-dummy blocks across in last MCU */
+ int last_row_height; /* # of non-dummy blocks down in last MCU */
+
+ /* Saved quantization table for component; NULL if none yet saved.
+ * See jdinput.c comments about the need for this information.
+ * This field is currently used only for decompression.
+ */
+ JQUANT_TBL *quant_table;
+
+ /* Private per-component storage for DCT or IDCT subsystem. */
+ void *dct_table; /* untyped here; the pointed-to layout is private to that subsystem */
+} jpeg_component_info;
+
+
+/* The script for encoding a multiple-scan file is an array of these (one entry per scan): */
+
+typedef struct {
+ int comps_in_scan; /* number of components encoded in this scan */
+ int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */
+ int Ss, Se; /* progressive JPEG spectral selection parms */
+ int Ah, Al; /* progressive JPEG successive approx. parms */
+} jpeg_scan_info;
+
+/* The decompressor can save APPn and COM markers in a singly linked list of these: */
+
+typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr;
+
+struct jpeg_marker_struct {
+ jpeg_saved_marker_ptr next; /* next in list, or NULL */
+ UINT8 marker; /* marker code: JPEG_COM, or JPEG_APP0+n */
+ unsigned int original_length; /* # bytes of data in the file */
+ unsigned int data_length; /* # bytes of data saved at data[] */
+ JOCTET *data; /* the data contained in the marker */
+ /* the marker length word is not counted in data_length or original_length */
+};
+
+/* Known color spaces. */
+
+#define JCS_EXTENSIONS 1 /* NOTE(review): presumably a compile-time marker that the JCS_EXT_* members exist -- confirm */
+#define JCS_ALPHA_EXTENSIONS 1 /* NOTE(review): presumably the same for the alpha-bearing JCS_EXT_* members -- confirm */
+
+typedef enum {
+ JCS_UNKNOWN, /* error/unspecified */
+ JCS_GRAYSCALE, /* monochrome */
+ JCS_RGB, /* red/green/blue as specified by the RGB_RED,
+ RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */
+ JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */
+ JCS_CMYK, /* C/M/Y/K */
+ JCS_YCCK, /* Y/Cb/Cr/K */
+ JCS_EXT_RGB, /* red/green/blue */
+ JCS_EXT_RGBX, /* red/green/blue/x */
+ JCS_EXT_BGR, /* blue/green/red */
+ JCS_EXT_BGRX, /* blue/green/red/x */
+ JCS_EXT_XBGR, /* x/blue/green/red */
+ JCS_EXT_XRGB, /* x/red/green/blue */
+ /* When out_color_space is set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+ or JCS_EXT_XRGB during decompression, the X byte is undefined, and in
+ order to ensure the best performance, libjpeg-turbo can set that byte to
+ whatever value it wishes. Use the following colorspace constants to
+ ensure that the X byte is set to 0xFF, so that it can be interpreted as an
+ opaque alpha channel. */
+ JCS_EXT_RGBA, /* red/green/blue/alpha */
+ JCS_EXT_BGRA, /* blue/green/red/alpha */
+ JCS_EXT_ABGR, /* alpha/blue/green/red */
+ JCS_EXT_ARGB, /* alpha/red/green/blue */
+ JCS_RGB565 /* 5-bit red/6-bit green/5-bit blue */
+} J_COLOR_SPACE;
+
+/* DCT/IDCT algorithm options (stored in the dct_method field of both master structs). */
+
+typedef enum {
+ JDCT_ISLOW, /* accurate integer method */
+ JDCT_IFAST, /* less accurate integer method [legacy feature] */
+ JDCT_FLOAT /* floating-point method [legacy feature] */
+} J_DCT_METHOD;
+
+#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */
+#define JDCT_DEFAULT JDCT_ISLOW /* the accurate integer method unless overridden */
+#endif
+#ifndef JDCT_FASTEST /* may be overridden in jconfig.h */
+#define JDCT_FASTEST JDCT_IFAST /* the less accurate integer method unless overridden */
+#endif
+
+/* Dithering options for decompression (used by dither_mode; ignored unless quantize_colors is set). */
+
+typedef enum {
+ JDITHER_NONE, /* no dithering */
+ JDITHER_ORDERED, /* simple ordered dither */
+ JDITHER_FS /* Floyd-Steinberg error diffusion dither */
+} J_DITHER_MODE;
+
+
+/* Common fields between JPEG compression and decompression master structs. The last field has no ';' because each use site writes "jpeg_common_fields;". */
+
+#define jpeg_common_fields \
+ struct jpeg_error_mgr *err; /* Error handler module */ \
+ struct jpeg_memory_mgr *mem; /* Memory manager module */ \
+ struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+ void *client_data; /* Available for use by application */ \
+ boolean is_decompressor; /* So common code can tell which is which */ \
+ int global_state /* For checking call sequence validity */
+
+/* Routines that are to be used by both halves of the library are declared
+ * to receive a pointer to this structure. There are no actual instances of
+ * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct.
+ */
+struct jpeg_common_struct {
+ jpeg_common_fields; /* Fields common to both master struct types */
+ /* Additional fields follow in an actual jpeg_compress_struct or
+ * jpeg_decompress_struct. All three structs must agree on these
+ * initial fields! (This would be a lot cleaner in C++.)
+ */
+};
+
+typedef struct jpeg_common_struct *j_common_ptr; /* view of either master struct through its common initial fields */
+typedef struct jpeg_compress_struct *j_compress_ptr; /* ptr to a compression master record */
+typedef struct jpeg_decompress_struct *j_decompress_ptr; /* ptr to a decompression master record */
+
+
+/* Master record for a compression instance (begins with jpeg_common_fields, so it can be passed as j_common_ptr) */
+
+struct jpeg_compress_struct {
+ jpeg_common_fields; /* Fields shared with jpeg_decompress_struct */
+
+ /* Destination for compressed data */
+ struct jpeg_destination_mgr *dest;
+
+ /* Description of source image --- these fields must be filled in by
+ * outer application before starting compression. in_color_space must
+ * be correct before you can even call jpeg_set_defaults().
+ */
+
+ JDIMENSION image_width; /* input image width */
+ JDIMENSION image_height; /* input image height */
+ int input_components; /* # of color components in input image */
+ J_COLOR_SPACE in_color_space; /* colorspace of input image */
+
+ double input_gamma; /* image gamma of input image */
+
+ /* Compression parameters --- these fields must be set before calling
+ * jpeg_start_compress(). We recommend calling jpeg_set_defaults() to
+ * initialize everything to reasonable defaults, then changing anything
+ * the application specifically wants to change. That way you won't get
+ * burnt when new parameters are added. Also note that there are several
+ * helper routines to simplify changing parameters.
+ */
+
+#if JPEG_LIB_VERSION >= 70
+ unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+ JDIMENSION jpeg_width; /* scaled JPEG image width */
+ JDIMENSION jpeg_height; /* scaled JPEG image height */
+ /* Dimensions of actual JPEG image that will be written to file,
+ * derived from input dimensions by scaling factors above.
+ * These fields are computed by jpeg_start_compress().
+ * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+ * in advance of calling jpeg_start_compress().
+ */
+#endif
+
+ int data_precision; /* bits of precision in image data */
+
+ int num_components; /* # of color components in JPEG image */
+ J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
+
+ jpeg_component_info *comp_info;
+ /* comp_info[i] describes component that appears i'th in SOF */
+
+ JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+#if JPEG_LIB_VERSION >= 70
+ int q_scale_factor[NUM_QUANT_TBLS];
+#endif
+ /* ptrs to coefficient quantization tables, or NULL if not defined,
+ * and corresponding scale factors (percentage, initialized 100).
+ */
+
+ JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+ JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+ /* ptrs to Huffman coding tables, or NULL if not defined */
+
+ UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
+ UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
+ UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
+
+ int num_scans; /* # of entries in scan_info array */
+ const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */
+ /* The default value of scan_info is NULL, which causes a single-scan
+ * sequential JPEG file to be emitted. To create a multi-scan file,
+ * set num_scans and scan_info to point to an array of scan definitions.
+ */
+
+ boolean raw_data_in; /* TRUE=caller supplies downsampled data */
+ boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */
+ boolean optimize_coding; /* TRUE=optimize entropy encoding parms */
+ boolean CCIR601_sampling; /* TRUE=first samples are cosited */
+#if JPEG_LIB_VERSION >= 70
+ boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
+#endif
+ int smoothing_factor; /* 1..100, or 0 for no input smoothing */
+ J_DCT_METHOD dct_method; /* DCT algorithm selector */
+
+ /* The restart interval can be specified in absolute MCUs by setting
+ * restart_interval, or in MCU rows by setting restart_in_rows
+ * (in which case the correct restart_interval will be figured
+ * for each scan).
+ */
+ unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */
+ int restart_in_rows; /* if > 0, MCU rows per restart interval */
+
+ /* Parameters controlling emission of special markers. */
+
+ boolean write_JFIF_header; /* should a JFIF marker be written? */
+ UINT8 JFIF_major_version; /* What to write for the JFIF version number */
+ UINT8 JFIF_minor_version;
+ /* These three values are not used by the JPEG code, merely copied */
+ /* into the JFIF APP0 marker. density_unit can be 0 for unknown, */
+ /* 1 for dots/inch, or 2 for dots/cm. Note that the pixel aspect */
+ /* ratio is defined by X_density/Y_density even when density_unit=0. */
+ UINT8 density_unit; /* JFIF code for pixel size units */
+ UINT16 X_density; /* Horizontal pixel density */
+ UINT16 Y_density; /* Vertical pixel density */
+ boolean write_Adobe_marker; /* should an Adobe marker be written? */
+
+ /* State variable: index of next scanline to be written to
+ * jpeg_write_scanlines(). Application may use this to control its
+ * processing loop, e.g., "while (next_scanline < image_height)".
+ */
+
+ JDIMENSION next_scanline; /* 0 .. image_height-1 */
+
+ /* Remaining fields are known throughout compressor, but generally
+ * should not be touched by a surrounding application.
+ */
+
+ /*
+ * These fields are computed during compression startup
+ */
+ boolean progressive_mode; /* TRUE if scan script uses progressive mode */
+ int max_h_samp_factor; /* largest h_samp_factor */
+ int max_v_samp_factor; /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+ int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */
+ int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */
+#endif
+
+ JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */
+ /* The coefficient controller receives data in units of MCU rows as defined
+ * for fully interleaved scans (whether the JPEG file is interleaved or not).
+ * There are v_samp_factor * DCTSIZE sample rows of each component in an
+ * "iMCU" (interleaved MCU) row.
+ */
+
+ /*
+ * These fields are valid during any one scan.
+ * They describe the components and MCUs actually appearing in the scan.
+ */
+ int comps_in_scan; /* # of JPEG components in this scan */
+ jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+ /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+ JDIMENSION MCUs_per_row; /* # of MCUs across the image */
+ JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */
+
+ int blocks_in_MCU; /* # of DCT blocks per MCU */
+ int MCU_membership[C_MAX_BLOCKS_IN_MCU];
+ /* MCU_membership[i] is index in cur_comp_info of component owning */
+ /* i'th block in an MCU */
+
+ int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */
+
+#if JPEG_LIB_VERSION >= 80
+ int block_size; /* the basic DCT block size: 1..16 */
+ const int *natural_order; /* natural-order position array */
+ int lim_Se; /* min( Se, DCTSIZE2-1 ) */
+#endif
+
+ /*
+ * Links to compression subobjects (methods and private variables of modules)
+ */
+ struct jpeg_comp_master *master;
+ struct jpeg_c_main_controller *main;
+ struct jpeg_c_prep_controller *prep;
+ struct jpeg_c_coef_controller *coef;
+ struct jpeg_marker_writer *marker;
+ struct jpeg_color_converter *cconvert;
+ struct jpeg_downsampler *downsample;
+ struct jpeg_forward_dct *fdct;
+ struct jpeg_entropy_encoder *entropy;
+ jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */
+ int script_space_size; /* NOTE(review): presumably the capacity of script_space -- confirm in jcparam.c */
+};
+
+
+/* Master record for a decompression instance (begins with jpeg_common_fields, so it can be passed as j_common_ptr) */
+
+struct jpeg_decompress_struct {
+ jpeg_common_fields; /* Fields shared with jpeg_compress_struct */
+
+ /* Source of compressed data */
+ struct jpeg_source_mgr *src;
+
+ /* Basic description of image --- filled in by jpeg_read_header(). */
+ /* Application may inspect these values to decide how to process image. */
+
+ JDIMENSION image_width; /* nominal image width (from SOF marker) */
+ JDIMENSION image_height; /* nominal image height */
+ int num_components; /* # of color components in JPEG image */
+ J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
+
+ /* Decompression processing parameters --- these fields must be set before
+ * calling jpeg_start_decompress(). Note that jpeg_read_header() initializes
+ * them to default values.
+ */
+
+ J_COLOR_SPACE out_color_space; /* colorspace for output */
+
+ unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+ double output_gamma; /* image gamma wanted in output */
+
+ boolean buffered_image; /* TRUE=multiple output passes */
+ boolean raw_data_out; /* TRUE=downsampled data wanted */
+
+ J_DCT_METHOD dct_method; /* IDCT algorithm selector */
+ boolean do_fancy_upsampling; /* TRUE=apply fancy upsampling */
+ boolean do_block_smoothing; /* TRUE=apply interblock smoothing */
+
+ boolean quantize_colors; /* TRUE=colormapped output wanted */
+ /* the following are ignored if not quantize_colors: */
+ J_DITHER_MODE dither_mode; /* type of color dithering to use */
+ boolean two_pass_quantize; /* TRUE=use two-pass color quantization */
+ int desired_number_of_colors; /* max # colors to use in created colormap */
+ /* these are significant only in buffered-image mode: */
+ boolean enable_1pass_quant; /* enable future use of 1-pass quantizer */
+ boolean enable_external_quant;/* enable future use of external colormap */
+ boolean enable_2pass_quant; /* enable future use of 2-pass quantizer */
+
+ /* Description of actual output image that will be returned to application.
+ * These fields are computed by jpeg_start_decompress().
+ * You can also use jpeg_calc_output_dimensions() to determine these values
+ * in advance of calling jpeg_start_decompress().
+ */
+
+ JDIMENSION output_width; /* scaled image width */
+ JDIMENSION output_height; /* scaled image height */
+ int out_color_components; /* # of color components in out_color_space */
+ int output_components; /* # of color components returned */
+ /* output_components is 1 (a colormap index) when quantizing colors;
+ * otherwise it equals out_color_components.
+ */
+ int rec_outbuf_height; /* min recommended height of scanline buffer */
+ /* If the buffer passed to jpeg_read_scanlines() is less than this many rows
+ * high, space and time will be wasted due to unnecessary data copying.
+ * Usually rec_outbuf_height will be 1 or 2, at most 4.
+ */
+
+ /* When quantizing colors, the output colormap is described by these fields.
+ * The application can supply a colormap by setting colormap non-NULL before
+ * calling jpeg_start_decompress; otherwise a colormap is created during
+ * jpeg_start_decompress or jpeg_start_output.
+ * The map has out_color_components rows and actual_number_of_colors columns.
+ */
+ int actual_number_of_colors; /* number of entries in use */
+ JSAMPARRAY colormap; /* The color map as a 2-D pixel array */
+
+ /* State variables: these variables indicate the progress of decompression.
+ * The application may examine these but must not modify them.
+ */
+
+ /* Row index of next scanline to be read from jpeg_read_scanlines().
+ * Application may use this to control its processing loop, e.g.,
+ * "while (output_scanline < output_height)".
+ */
+ JDIMENSION output_scanline; /* 0 .. output_height-1 */
+
+ /* Current input scan number and number of iMCU rows completed in scan.
+ * These indicate the progress of the decompressor input side.
+ */
+ int input_scan_number; /* Number of SOS markers seen so far */
+ JDIMENSION input_iMCU_row; /* Number of iMCU rows completed */
+
+ /* The "output scan number" is the notional scan being displayed by the
+ * output side. The decompressor will not allow output scan/row number
+ * to get ahead of input scan/row, but it can fall arbitrarily far behind.
+ */
+ int output_scan_number; /* Nominal scan number being displayed */
+ JDIMENSION output_iMCU_row; /* Number of iMCU rows read */
+
+ /* Current progression status. coef_bits[c][i] indicates the precision
+ * with which component c's DCT coefficient i (in zigzag order) is known.
+ * It is -1 when no data has yet been received, otherwise it is the point
+ * transform (shift) value for the most recent scan of the coefficient
+ * (thus, 0 at completion of the progression).
+ * This pointer is NULL when reading a non-progressive file.
+ */
+ int (*coef_bits)[DCTSIZE2]; /* -1 or current Al value for each coef */
+
+ /* Internal JPEG parameters --- the application usually need not look at
+ * these fields. Note that the decompressor output side may not use
+ * any parameters that can change between scans.
+ */
+
+ /* Quantization and Huffman tables are carried forward across input
+ * datastreams when processing abbreviated JPEG datastreams.
+ */
+
+ JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+ /* ptrs to coefficient quantization tables, or NULL if not defined */
+
+ JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+ JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+ /* ptrs to Huffman coding tables, or NULL if not defined */
+
+ /* These parameters are never carried across datastreams, since they
+ * are given in SOF/SOS markers or defined to be reset by SOI.
+ */
+
+ int data_precision; /* bits of precision in image data */
+
+ jpeg_component_info *comp_info;
+ /* comp_info[i] describes component that appears i'th in SOF */
+
+#if JPEG_LIB_VERSION >= 80
+ boolean is_baseline; /* TRUE if Baseline SOF0 encountered */
+#endif
+ boolean progressive_mode; /* TRUE if SOFn specifies progressive mode */
+ boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */
+
+ UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
+ UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
+ UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
+
+ unsigned int restart_interval; /* MCUs per restart interval, or 0 for no restart */
+
+ /* These fields record data obtained from optional markers recognized by
+ * the JPEG library.
+ */
+ boolean saw_JFIF_marker; /* TRUE iff a JFIF APP0 marker was found */
+ /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */
+ UINT8 JFIF_major_version; /* JFIF version number */
+ UINT8 JFIF_minor_version;
+ UINT8 density_unit; /* JFIF code for pixel size units */
+ UINT16 X_density; /* Horizontal pixel density */
+ UINT16 Y_density; /* Vertical pixel density */
+ boolean saw_Adobe_marker; /* TRUE iff an Adobe APP14 marker was found */
+ UINT8 Adobe_transform; /* Color transform code from Adobe marker */
+
+ boolean CCIR601_sampling; /* TRUE=first samples are cosited */
+
+ /* Aside from the specific data retained from APPn markers known to the
+ * library, the uninterpreted contents of any or all APPn and COM markers
+ * can be saved in a list for examination by the application.
+ */
+ jpeg_saved_marker_ptr marker_list; /* Head of list of saved markers */
+
+ /* Remaining fields are known throughout decompressor, but generally
+ * should not be touched by a surrounding application.
+ */
+
+ /*
+ * These fields are computed during decompression startup
+ */
+ int max_h_samp_factor; /* largest h_samp_factor */
+ int max_v_samp_factor; /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+ int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */
+ int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */
+#else
+ int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */
+#endif
+
+ JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */
+ /* The coefficient controller's input and output progress is measured in
+ * units of "iMCU" (interleaved MCU) rows. These are the same as MCU rows
+ * in fully interleaved JPEG scans, but are used whether the scan is
+ * interleaved or not. We define an iMCU row as v_samp_factor DCT block
+ * rows of each component. Therefore, the IDCT output contains
+ * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
+ */
+
+ JSAMPLE *sample_range_limit; /* table for fast range-limiting */
+
+ /*
+ * These fields are valid during any one scan.
+ * They describe the components and MCUs actually appearing in the scan.
+ * Note that the decompressor output side must not use these fields.
+ */
+ int comps_in_scan; /* # of JPEG components in this scan */
+ jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+ /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+ JDIMENSION MCUs_per_row; /* # of MCUs across the image */
+ JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */
+
+ int blocks_in_MCU; /* # of DCT blocks per MCU */
+ int MCU_membership[D_MAX_BLOCKS_IN_MCU];
+ /* MCU_membership[i] is index in cur_comp_info of component owning */
+ /* i'th block in an MCU */
+
+ int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */
+
+#if JPEG_LIB_VERSION >= 80
+ /* These fields are derived from Se of first SOS marker.
+ */
+ int block_size; /* the basic DCT block size: 1..16 */
+ const int *natural_order; /* natural-order position array for entropy decode */
+ int lim_Se; /* min( Se, DCTSIZE2-1 ) for entropy decode */
+#endif
+
+ /* This field is shared between entropy decoder and marker parser.
+ * It is either zero or the code of a JPEG marker that has been
+ * read from the data source, but has not yet been processed.
+ */
+ int unread_marker; /* 0 = no marker pending */
+
+ /*
+ * Links to decompression subobjects (methods, private variables of modules)
+ */
+ struct jpeg_decomp_master *master;
+ struct jpeg_d_main_controller *main;
+ struct jpeg_d_coef_controller *coef;
+ struct jpeg_d_post_controller *post;
+ struct jpeg_input_controller *inputctl;
+ struct jpeg_marker_reader *marker;
+ struct jpeg_entropy_decoder *entropy;
+ struct jpeg_inverse_dct *idct;
+ struct jpeg_upsampler *upsample;
+ struct jpeg_color_deconverter *cconvert;
+ struct jpeg_color_quantizer *cquantize;
+};
+
+
+/* "Object" declarations for JPEG modules that may be supplied or called
+ * directly by the surrounding application.
+ * As with all objects in the JPEG library, these structs only define the
+ * publicly visible methods and state variables of a module. Additional
+ * private fields may exist after the public ones.
+ */
+
+
+/* Error handler object: callbacks, the most recent message and its parameters, and the message tables */
+
+struct jpeg_error_mgr {
+ /* Error exit handler: does not return to caller */
+ void (*error_exit) (j_common_ptr cinfo);
+ /* Conditionally emit a trace or warning message */
+ void (*emit_message) (j_common_ptr cinfo, int msg_level);
+ /* Routine that actually outputs a trace or error message */
+ void (*output_message) (j_common_ptr cinfo);
+ /* Format a message string for the most recent JPEG error or message */
+ void (*format_message) (j_common_ptr cinfo, char *buffer);
+#define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */
+ /* Reset error state variables at start of a new image */
+ void (*reset_error_mgr) (j_common_ptr cinfo);
+
+ /* The message ID code and any parameters are saved here.
+ * A message can have one string parameter or up to 8 int parameters.
+ */
+ int msg_code;
+#define JMSG_STR_PARM_MAX 80 /* size of the string-parameter buffer msg_parm.s */
+ union {
+ int i[8]; /* the int parameters (up to 8) */
+ char s[JMSG_STR_PARM_MAX]; /* or the single string parameter; shares storage with i[] */
+ } msg_parm;
+
+ /* Standard state variables for error facility */
+
+ int trace_level; /* max msg_level that will be displayed */
+
+ /* For recoverable corrupt-data errors, we emit a warning message,
+ * but keep going unless emit_message chooses to abort. emit_message
+ * should count warnings in num_warnings. The surrounding application
+ * can check for bad data by seeing if num_warnings is nonzero at the
+ * end of processing.
+ */
+ long num_warnings; /* number of corrupt-data warnings */
+
+ /* These fields point to the table(s) of error message strings.
+ * An application can change the table pointer to switch to a different
+ * message list (typically, to change the language in which errors are
+ * reported). Some applications may wish to add additional error codes
+ * that will be handled by the JPEG library error mechanism; the second
+ * table pointer is used for this purpose.
+ *
+ * First table includes all errors generated by JPEG library itself.
+ * Error code 0 is reserved for a "no such error string" message.
+ */
+ const char * const *jpeg_message_table; /* Library errors */
+ int last_jpeg_message; /* Table contains strings 0..last_jpeg_message */
+ /* Second table can be added by application (see cjpeg/djpeg for example).
+ * It contains strings numbered first_addon_message..last_addon_message.
+ */
+ const char * const *addon_message_table; /* Non-library errors */
+ int first_addon_message; /* code for first string in addon table */
+ int last_addon_message; /* code for last string in addon table */
+};
+
+
+/* Progress monitor object: one method pointer plus pass/work-unit counters */
+
+struct jpeg_progress_mgr {
+ void (*progress_monitor) (j_common_ptr cinfo); /* monitor method pointer */
+
+ long pass_counter; /* work units completed in this pass */
+ long pass_limit; /* total number of work units in this pass */
+ int completed_passes; /* passes completed so far */
+ int total_passes; /* total number of passes expected */
+};
+
+
+/* Data destination object for compression: output buffer state + methods */
+
+struct jpeg_destination_mgr {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+
+ void (*init_destination) (j_compress_ptr cinfo);
+ boolean (*empty_output_buffer) (j_compress_ptr cinfo); /* NOTE(review): presumably run when the buffer fills -- confirm */
+ void (*term_destination) (j_compress_ptr cinfo);
+};
+
+
+/* Data source object for decompression: input buffer state + methods */
+
+struct jpeg_source_mgr {
+ const JOCTET *next_input_byte; /* => next byte to read from buffer */
+ size_t bytes_in_buffer; /* # of bytes remaining in buffer */
+
+ void (*init_source) (j_decompress_ptr cinfo);
+ boolean (*fill_input_buffer) (j_decompress_ptr cinfo);
+ void (*skip_input_data) (j_decompress_ptr cinfo, long num_bytes);
+ boolean (*resync_to_restart) (j_decompress_ptr cinfo, int desired); /* default: jpeg_resync_to_restart */
+ void (*term_source) (j_decompress_ptr cinfo);
+};
+
+
+/* Memory manager object.
+ * Allocates "small" objects (a few K total), "large" objects (tens of K),
+ * and "really big" objects (virtual arrays with backing store if needed).
+ * The memory manager does not allow individual objects to be freed; rather,
+ * each created object is assigned to a pool, and whole pools can be freed
+ * at once. This is faster and more convenient than remembering exactly what
+ * to free, especially where malloc()/free() are not too speedy.
+ * NB: alloc routines never return NULL. They exit to error_exit if not
+ * successful.
+ */
+
+#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */
+#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS 2 /* count of pool IDs defined above */
+
+typedef struct jvirt_sarray_control *jvirt_sarray_ptr; /* virtual JSAMPARRAY handle */
+typedef struct jvirt_barray_control *jvirt_barray_ptr; /* virtual JBLOCKARRAY handle */
+
+
+struct jpeg_memory_mgr {
+ /* Method pointers (alloc routines call error_exit rather than return NULL) */
+ void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
+ void *(*alloc_large) (j_common_ptr cinfo, int pool_id,
+ size_t sizeofobject);
+ JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id,
+ JDIMENSION samplesperrow, JDIMENSION numrows);
+ JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
+ JDIMENSION blocksperrow, JDIMENSION numrows);
+ jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id,
+ boolean pre_zero,
+ JDIMENSION samplesperrow,
+ JDIMENSION numrows,
+ JDIMENSION maxaccess);
+ jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id,
+ boolean pre_zero,
+ JDIMENSION blocksperrow,
+ JDIMENSION numrows,
+ JDIMENSION maxaccess);
+ void (*realize_virt_arrays) (j_common_ptr cinfo);
+ JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows,
+ boolean writable);
+ JBLOCKARRAY (*access_virt_barray) (j_common_ptr cinfo, jvirt_barray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows,
+ boolean writable);
+ void (*free_pool) (j_common_ptr cinfo, int pool_id); /* frees a whole pool */
+ void (*self_destruct) (j_common_ptr cinfo);
+
+ /* Limit on memory allocation for this JPEG object. (Note that this is
+ * merely advisory, not a guaranteed maximum; it only affects the space
+ * used for virtual-array buffers.) May be changed by outer application
+ * after creating the JPEG object.
+ */
+ long max_memory_to_use;
+
+ /* Maximum allocation request accepted by alloc_large. */
+ long max_alloc_chunk;
+};
+
+
+/* Signature of application-supplied marker processing methods (installed via
+ * jpeg_set_marker_processor). Marker code is found in cinfo->unread_marker.
+ */
+typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
+
+
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters. libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
+ */
+
+#define JPP(arglist) arglist /* expands to its argument unchanged */
+
+
+/* Default error-management setup */
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
+
+/* Initialization of JPEG compression and decompression objects.
+ * jpeg_create_compress() and jpeg_create_decompress() are the exported
+ * names that applications should call. These expand to calls on
+ * jpeg_CreateCompress and jpeg_CreateDecompress with JPEG_LIB_VERSION and
+ * the struct size passed for version mismatch checking.
+ * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
+ */
+#define jpeg_create_compress(cinfo) \
+ jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_compress_struct))
+#define jpeg_create_decompress(cinfo) \
+ jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+ size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+ size_t structsize);
+/* Destruction of JPEG compression and decompression objects */
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
+
+/* Standard data source and destination managers: stdio streams. */
+/* Caller is responsible for opening the file before and closing after. */
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
+
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+ unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+ const unsigned char *inbuffer, unsigned long insize);
+#endif /* JPEG_LIB_VERSION >= 80 || MEM_SRCDST_SUPPORTED */
+
+/* Default parameter setup for compression */
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
+/* Compression parameter setup aids */
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+ J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+ boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+ boolean force_baseline);
+#if JPEG_LIB_VERSION >= 70
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+ boolean force_baseline);
+#endif /* JPEG_LIB_VERSION >= 70 */
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+ const unsigned int *basic_table,
+ int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
+
+/* Main entry points for compression */
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+ boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
+
+#if JPEG_LIB_VERSION >= 70
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
+#endif /* JPEG_LIB_VERSION >= 70 */
+
+/* Replaces jpeg_write_scanlines when writing raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION num_lines);
+
+/* Write a special marker. See libjpeg.txt concerning safe usage. */
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+ const JOCTET *dataptr, unsigned int datalen);
+/* Same as jpeg_write_marker, but piecemeal: header first, then data bytes. */
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+ unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
+
+/* Alternate compression function: just write an abbreviated table file */
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile. See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+ const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len);
+
+
+/* Decompression startup: read start of JPEG datastream to see what's there */
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
+/* jpeg_read_header return value is one of: */
+#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK 1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
+/* If you pass require_image = TRUE (normal case), you need not check for
+ * a TABLES_ONLY return code; an abbreviated file will cause an error exit.
+ * JPEG_SUSPENDED is only possible if you use a data source module that can
+ * give a suspension return (the stdio source module doesn't).
+ */
+
+/* Main entry points for decompression */
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+ JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
+
+/* Replaces jpeg_read_scanlines when reading raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION max_lines);
+
+/* Additional entry points for buffered-image mode. */
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
+/* jpeg_consume_input return value is one of: */
+/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */
+#define JPEG_REACHED_SOS 1 /* Reached start of new scan */
+#define JPEG_REACHED_EOI 2 /* Reached end of image */
+#define JPEG_ROW_COMPLETED 3 /* Completed one iMCU row */
+#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */
+
+/* Precalculate output dimensions for current decompression parameters. */
+#if JPEG_LIB_VERSION >= 80
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
+#endif /* JPEG_LIB_VERSION >= 80 */
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
+
+/* Control saving of COM and APPn markers into marker_list. */
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+ unsigned int length_limit);
+
+/* Install a special processing method for COM or APPn markers. */
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+ int marker_code,
+ jpeg_marker_parser_method routine);
+
+/* Read or write raw DCT coefficients --- useful for lossless transcoding. */
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+ j_compress_ptr dstinfo);
+
+/* If you choose to abort compression or decompression before completing
+ * jpeg_finish_(de)compress, then you need to clean up to release memory,
+ * temporary files, etc. You can just call jpeg_destroy_(de)compress
+ * if you're done with the JPEG object, but if you want to clean it up and
+ * reuse it, call the matching routine below:
+ */
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
+
+/* Generic versions of jpeg_abort and jpeg_destroy that work on either
+ * flavor of JPEG object. These may be more convenient in some places.
+ */
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
+
+/* Default restart-marker-resync procedure for use by data source modules */
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile. See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+ JOCTET **icc_data_ptr,
+ unsigned int *icc_data_len);
+
+
+/* These marker codes are exported since applications and data source modules
+ * are likely to want to use them.
+ */
+
+#define JPEG_RST0 0xD0 /* RST0 marker code */
+#define JPEG_EOI 0xD9 /* EOI marker code */
+#define JPEG_APP0 0xE0 /* APP0 marker code */
+#define JPEG_COM 0xFE /* COM marker code */
+
+
+/* If we have a brain-damaged compiler that emits warnings (or worse, errors)
+ * for structure definitions that are never filled in, keep it quiet by
+ * supplying dummy definitions for the various substructures.
+ */
+
+#ifdef INCOMPLETE_TYPES_BROKEN
+#ifndef JPEG_INTERNALS /* otherwise jpegint.h supplies the real definitions */
+struct jvirt_sarray_control { long dummy; };
+struct jvirt_barray_control { long dummy; };
+struct jpeg_comp_master { long dummy; };
+struct jpeg_c_main_controller { long dummy; };
+struct jpeg_c_prep_controller { long dummy; };
+struct jpeg_c_coef_controller { long dummy; };
+struct jpeg_marker_writer { long dummy; };
+struct jpeg_color_converter { long dummy; };
+struct jpeg_downsampler { long dummy; };
+struct jpeg_forward_dct { long dummy; };
+struct jpeg_entropy_encoder { long dummy; };
+struct jpeg_decomp_master { long dummy; };
+struct jpeg_d_main_controller { long dummy; };
+struct jpeg_d_coef_controller { long dummy; };
+struct jpeg_d_post_controller { long dummy; };
+struct jpeg_input_controller { long dummy; };
+struct jpeg_marker_reader { long dummy; };
+struct jpeg_entropy_decoder { long dummy; };
+struct jpeg_inverse_dct { long dummy; };
+struct jpeg_upsampler { long dummy; };
+struct jpeg_color_deconverter { long dummy; };
+struct jpeg_color_quantizer { long dummy; };
+#endif /* JPEG_INTERNALS */
+#endif /* INCOMPLETE_TYPES_BROKEN */
+
+
+/*
+ * The JPEG library modules define JPEG_INTERNALS before including this file.
+ * The internal structure declarations are read only when that is true.
+ * Applications using the library should not include jpegint.h, but may wish
+ * to include jerror.h.
+ */
+
+#ifdef JPEG_INTERNALS
+#include "jpegint.h" /* fetch private declarations */
+#include "jerror.h" /* fetch error codes too */
+#endif /* JPEG_INTERNALS */
+
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+} /* presumably ends an extern "C" block opened earlier -- confirm */
+#endif /* DONT_USE_EXTERN_C */
+#endif /* __cplusplus */
+
+#endif /* JPEGLIB_H */
diff --git a/media/libjpeg/jquant1.c b/media/libjpeg/jquant1.c
new file mode 100644
index 0000000000..73b83e16e5
--- /dev/null
+++ b/media/libjpeg/jquant1.c
@@ -0,0 +1,856 @@
+/*
+ * jquant1.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains 1-pass color quantization (color mapping) routines.
+ * These routines provide mapping to a fixed color map using equally spaced
+ * color values. Optional Floyd-Steinberg or ordered dithering is available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#ifdef QUANT_1PASS_SUPPORTED
+
+
+/*
+ * The main purpose of 1-pass quantization is to provide a fast, if not very
+ * high quality, colormapped output capability. A 2-pass quantizer usually
+ * gives better visual quality; however, for quantized grayscale output this
+ * quantizer is perfectly adequate. Dithering is highly recommended with this
+ * quantizer, though you can turn it off if you really want to.
+ *
+ * In 1-pass quantization the colormap must be chosen in advance of seeing the
+ * image. We use a map consisting of all combinations of Ncolors[i] color
+ * values for the i'th component. The Ncolors[] values are chosen so that
+ * their product, the total number of colors, is no more than that requested.
+ * (In most cases, the product will be somewhat less.)
+ *
+ * Since the colormap is orthogonal, the representative value for each color
+ * component can be determined without considering the other components;
+ * then these indexes can be combined into a colormap index by a standard
+ * N-dimensional-array-subscript calculation. Most of the arithmetic involved
+ * can be precalculated and stored in the lookup table colorindex[].
+ * colorindex[i][j] maps pixel value j in component i to the nearest
+ * representative value (grid plane) for that component; this index is
+ * multiplied by the array stride for component i, so that the
+ * index of the colormap entry closest to a given pixel value is just
+ * sum( colorindex[component-number][pixel-component-value] )
+ * Aside from being fast, this scheme allows for variable spacing between
+ * representative values with no additional lookup cost.
+ *
+ * If gamma correction has been applied in color conversion, it might be wise
+ * to adjust the color grid spacing so that the representative colors are
+ * equidistant in linear space. At this writing, gamma correction is not
+ * implemented by jdcolor, so nothing is done here.
+ */
+
+
+/* Declarations for ordered dithering.
+ *
+ * We use a standard 16x16 ordered dither array. The basic concept of ordered
+ * dithering is described in many references, for instance Dale Schumacher's
+ * chapter II.2 of Graphics Gems II (James Arvo, ed. Academic Press, 1991).
+ * In place of Schumacher's comparisons against a "threshold" value, we add a
+ * "dither" value to the input pixel and then round the result to the nearest
+ * output value. The dither value is equivalent to (0.5 - threshold) times
+ * the distance between output values. For ordered dithering, we assume that
+ * the output colors are equally spaced; if not, results will probably be
+ * worse, since the dither may be too much or too little at a given point.
+ *
+ * The normal calculation would be to form pixel value + dither, range-limit
+ * this to 0..MAXJSAMPLE, and then index into the colorindex table as usual.
+ * We can skip the separate range-limiting step by extending the colorindex
+ * table in both directions.
+ */
+
+#define ODITHER_SIZE 16 /* dimension of dither matrix */
+/* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
+#define ODITHER_CELLS (ODITHER_SIZE * ODITHER_SIZE) /* # cells in matrix */
+#define ODITHER_MASK (ODITHER_SIZE - 1) /* mask for wrapping around
+ counters */
+
+typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE]; /* whole matrix */
+typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE]; /* pointer to matrix rows */
+
+static const UINT8 base_dither_matrix[ODITHER_SIZE][ODITHER_SIZE] = {
+ /* Bayer's order-4 dither array. Generated by the code given in
+ * Stephen Hawley's article "Ordered Dithering" in Graphics Gems I.
+ * Entries are fill orders and must range from 0 to ODITHER_CELLS-1 (255).
+ */
+ { 0,192, 48,240, 12,204, 60,252, 3,195, 51,243, 15,207, 63,255 },
+ { 128, 64,176,112,140, 76,188,124,131, 67,179,115,143, 79,191,127 },
+ { 32,224, 16,208, 44,236, 28,220, 35,227, 19,211, 47,239, 31,223 },
+ { 160, 96,144, 80,172,108,156, 92,163, 99,147, 83,175,111,159, 95 },
+ { 8,200, 56,248, 4,196, 52,244, 11,203, 59,251, 7,199, 55,247 },
+ { 136, 72,184,120,132, 68,180,116,139, 75,187,123,135, 71,183,119 },
+ { 40,232, 24,216, 36,228, 20,212, 43,235, 27,219, 39,231, 23,215 },
+ { 168,104,152, 88,164,100,148, 84,171,107,155, 91,167,103,151, 87 },
+ { 2,194, 50,242, 14,206, 62,254, 1,193, 49,241, 13,205, 61,253 },
+ { 130, 66,178,114,142, 78,190,126,129, 65,177,113,141, 77,189,125 },
+ { 34,226, 18,210, 46,238, 30,222, 33,225, 17,209, 45,237, 29,221 },
+ { 162, 98,146, 82,174,110,158, 94,161, 97,145, 81,173,109,157, 93 },
+ { 10,202, 58,250, 6,198, 54,246, 9,201, 57,249, 5,197, 53,245 },
+ { 138, 74,186,122,134, 70,182,118,137, 73,185,121,133, 69,181,117 },
+ { 42,234, 26,218, 38,230, 22,214, 41,233, 25,217, 37,229, 21,213 },
+ { 170,106,154, 90,166,102,150, 86,169,105,153, 89,165,101,149, 85 }
+};
+
+
+/* Declarations for Floyd-Steinberg dithering.
+ *
+ * Errors are accumulated into the array fserrors[], at a resolution of
+ * 1/16th of a pixel count. The error at a given pixel is propagated
+ * to its not-yet-processed neighbors using the standard F-S fractions,
+ * ... (here) 7/16
+ * 3/16 5/16 1/16
+ * We work left-to-right on even rows, right-to-left on odd rows.
+ *
+ * We can get away with a single array (holding one row's worth of errors)
+ * by using it to store the current row's errors at pixel columns not yet
+ * processed, but the next row's errors at columns already processed. We
+ * need only a few extra variables to hold the errors immediately around the
+ * current column. (If we are lucky, those variables are in registers, but
+ * even if not, they're probably cheaper to access than array elements are.)
+ *
+ * The fserrors[] array is indexed [component#][position].
+ * We provide (#columns + 2) entries per component; the extra entry at each
+ * end saves us from special-casing the first and last pixels.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+typedef INT16 FSERROR; /* 16 bits should be enough */
+typedef int LOCFSERROR; /* use 'int' for calculation temps */
+#else
+typedef JLONG FSERROR; /* may need more than 16 bits */
+typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+typedef FSERROR *FSERRPTR; /* pointer to error array */
+
+
+/* Private subobject: state kept by the 1-pass color quantizer */
+
+#define MAX_Q_COMPS 4 /* max components I can handle */
+
+typedef struct {
+ struct jpeg_color_quantizer pub; /* public fields */
+
+ /* Initially allocated colormap is saved here */
+ JSAMPARRAY sv_colormap; /* The color map as a 2-D pixel array */
+ int sv_actual; /* number of entries in use */
+
+ JSAMPARRAY colorindex; /* Precomputed mapping for speed */
+ /* colorindex[i][j] = index of color closest to pixel value j in component i,
+ * premultiplied as described above. Since colormap indexes must fit into
+ * JSAMPLEs, the entries of this array will too.
+ */
+ boolean is_padded; /* is the colorindex padded for odither? */
+
+ int Ncolors[MAX_Q_COMPS]; /* # of values allocated to each component */
+
+ /* Variables for ordered dithering */
+ int row_index; /* cur row's vertical index in dither matrix */
+ ODITHER_MATRIX_PTR odither[MAX_Q_COMPS]; /* one dither array per component */
+
+ /* Variables for Floyd-Steinberg dithering */
+ FSERRPTR fserrors[MAX_Q_COMPS]; /* accumulated errors */
+ boolean on_odd_row; /* flag to remember which row we are on */
+} my_cquantizer;
+
+typedef my_cquantizer *my_cquantize_ptr; /* pointer to the private subobject */
+
+
+/*
+ * Policy-making subroutines for create_colormap and create_colorindex.
+ * These routines determine the colormap to be used. The rest of the module
+ * only assumes that the colormap is orthogonal.
+ *
+ * * select_ncolors decides how to divvy up the available colors
+ * among the components.
+ * * output_value defines the set of representative values for a component.
+ * * largest_input_value defines the mapping from input values to
+ * representative values for a component.
+ * Note that the latter two routines may impose different policies for
+ * different components, though this is not currently done.
+ */
+
+
+LOCAL(int)
+select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
+/* Split the requested color budget among the output components. */
+/* Ncolors[] receives the per-component counts; the return value is their */
+/* product, i.e. the total number of colors in the map. */
+{
+  int ncomps = cinfo->out_color_components;
+  long limit = (long)cinfo->desired_number_of_colors;
+  int product, base, pos, comp;
+  boolean grew;
+  long trial;
+  int comp_order[3];            /* component indexes in G, R, B order */
+
+  comp_order[0] = rgb_green[cinfo->out_color_space];
+  comp_order[1] = rgb_red[cinfo->out_color_space];
+  comp_order[2] = rgb_blue[cinfo->out_color_space];
+
+  /* Each component can get at least floor(ncomps'th root of limit) values. */
+  /* Raise candidates to the ncomps'th power until one overshoots limit. */
+  for (base = 2; ; base++) {
+    trial = base;                       /* trial = base ** ncomps */
+    for (pos = 1; pos < ncomps; pos++)
+      trial *= base;
+    if (trial > limit)
+      break;
+  }
+  base--;                               /* last candidate that still fit */
+
+  /* Fewer than 2 values per component is unusable */
+  if (base < 2)
+    ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int)trial);
+
+  /* Start every component at base values */
+  product = 1;
+  for (pos = 0; pos < ncomps; pos++) {
+    Ncolors[pos] = base;
+    product *= base;
+  }
+  /* Spend leftover capacity: keep sweeping the components, bumping each
+   * count by one while the product stays within limit.  A sweep stops at the
+   * first bump that would not fit, and we quit after a sweep that changed
+   * nothing.  For JCS_RGB the sweep visits G first, then R, then B.
+   */
+  do {
+    grew = FALSE;
+    for (pos = 0; pos < ncomps; pos++) {
+      comp = pos;
+      if (cinfo->out_color_space == JCS_RGB)
+        comp = comp_order[pos];
+      /* product after bumping Ncolors[comp]; long math avoids overflow */
+      trial = (long)(product / Ncolors[comp]) * (Ncolors[comp] + 1);
+      if (trial > limit)
+        break;
+      Ncolors[comp]++;
+      product = (int)trial;
+      grew = TRUE;
+    }
+  } while (grew);
+  return product;
+}
+
+
+LOCAL(int)
+output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
+/* Representative sample value for grid plane j (0 <= j <= maxj). */
+/* Results increase with j and always lie within 0..MAXJSAMPLE. */
+{
+  /* Planes are spread evenly over the sample range, with plane 0 pinned at
+   * 0 and plane maxj pinned at MAXJSAMPLE so that dithering can never leave
+   * the gamut.  Adding maxj/2 first makes the division round to nearest.
+   */
+  JLONG scaled = (JLONG)j * MAXJSAMPLE + maxj / 2;
+  return (int)(scaled / maxj);
+}
+
+
+LOCAL(int)
+largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
+/* Highest input sample mapped to plane j: the breakpoint midway between */
+/* adjacent output_value() results.  f(0) >= 0, f(maxj) >= MAXJSAMPLE. */
+{
+  JLONG numer = (JLONG)(2 * j + 1) * MAXJSAMPLE + maxj;
+  return (int)(numer / (2 * maxj));
+}
+
+
+/*
+ * Create the colormap.
+ */
+
+LOCAL(void)
+create_colormap(j_decompress_ptr cinfo)
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int ncomps = cinfo->out_color_components;
+  JSAMPARRAY map;               /* colormap under construction */
+  JSAMPROW row;                 /* map row for the current component */
+  int ncolors;                  /* total number of distinct output colors */
+  int comp, level, nlevels, run, period, start, off, sample;
+
+  /* Decide how many values each component gets */
+  ncolors = select_ncolors(cinfo, cquantize->Ncolors);
+
+  /* Trace the chosen counts */
+  if (ncomps != 3)
+    TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, ncolors);
+  else
+    TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, ncolors,
+             cquantize->Ncolors[0], cquantize->Ncolors[1],
+             cquantize->Ncolors[2]);
+
+  /* Build the map in row-major order over the component grid, so the */
+  /* last (highest-indexed) component varies fastest between entries. */
+  map = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)ncolors, (JDIMENSION)ncomps);
+
+  /* run    = length of each stretch of identical entries in a row */
+  /* period = spacing between stretches that hold the same value */
+  period = ncolors;
+
+  for (comp = 0; comp < ncomps; comp++) {
+    row = map[comp];
+    nlevels = cquantize->Ncolors[comp];
+    run = period / nlevels;
+    for (level = 0; level < nlevels; level++) {
+      sample = output_value(cinfo, comp, level, nlevels - 1);
+      /* stamp this value into every stretch belonging to this level */
+      for (start = level * run; start < ncolors; start += period) {
+        off = 0;
+        while (off < run) {
+          row[start + off] = (JSAMPLE)sample;
+          off++;
+        }
+      }
+    }
+    period = run;               /* next component subdivides each stretch */
+  }
+
+  /* Stash the map in the private object so that it survives changes */
+  /* of color quantization mode. */
+  cquantize->sv_colormap = map;
+  cquantize->sv_actual = ncolors;
+}
+
+
+/*
+ * Create the color index table.
+ */
+
+LOCAL(void)
+create_colorindex(j_decompress_ptr cinfo)
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  JSAMPROW table;
+  int comp, in, level, nlevels, stride, top, extra;
+
+  /* Ordered dither looks up pixel + dither without range-limiting first,
+   * so its tables carry MAXJSAMPLE extra entries on each side (usable
+   * indexes run from -MAXJSAMPLE to 2*MAXJSAMPLE).  The other modes need no
+   * padding; is_padded records which layout was built in case the user
+   * switches dithering mode later.
+   */
+  extra = (cinfo->dither_mode == JDITHER_ORDERED) ? MAXJSAMPLE * 2 : 0;
+  cquantize->is_padded = (extra != 0) ? TRUE : FALSE;
+
+  cquantize->colorindex = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(MAXJSAMPLE + 1 + extra),
+     (JDIMENSION)cinfo->out_color_components);
+
+  /* stride = number of adjacent colormap entries sharing one value of the */
+  /* current component; it starts at the map size and shrinks per component */
+  stride = cquantize->sv_actual;
+
+  for (comp = 0; comp < cinfo->out_color_components; comp++) {
+    nlevels = cquantize->Ncolors[comp];
+    stride /= nlevels;
+
+    /* Shift the row pointer so negative indexes land in the front padding */
+    if (extra)
+      cquantize->colorindex[comp] += MAXJSAMPLE;
+    table = cquantize->colorindex[comp];
+
+    /* Walk the input range once; level is the current output plane and */
+    /* top is the largest input value that still maps to it. */
+    level = 0;
+    top = largest_input_value(cinfo, comp, 0, nlevels - 1);
+    for (in = 0; in <= MAXJSAMPLE; in++) {
+      while (in > top) {
+        level++;
+        top = largest_input_value(cinfo, comp, level, nlevels - 1);
+      }
+      /* store level pre-scaled by stride so lookups need no multiply */
+      table[in] = (JSAMPLE)(level * stride);
+    }
+
+    /* Replicate the end values into the padding, if present */
+    if (extra) {
+      for (in = 1; in <= MAXJSAMPLE; in++) {
+        table[-in] = table[0];
+        table[MAXJSAMPLE + in] = table[MAXJSAMPLE];
+      }
+    }
+  }
+}
+
+
+/*
+ * Create an ordered-dither array for a component having ncolors
+ * distinct output values.
+ */
+
+LOCAL(ODITHER_MATRIX_PTR)
+make_odither_array(j_decompress_ptr cinfo, int ncolors)
+{
+  ODITHER_MATRIX_PTR matrix;
+  int row, col, order;
+  JLONG numer, denom, quot;
+
+  matrix = (ODITHER_MATRIX_PTR)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(ODITHER_MATRIX));
+  /* Adjacent output values are MAXJSAMPLE/(ncolors-1) apart, so a cell
+   * whose fill order is f (0 <= f < N, N = ODITHER_CELLS) gets the dither
+   * (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).  The wide products are formed
+   * in JLONG so that they stay safe where int is only 16 bits.
+   */
+  denom = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
+  for (row = 0; row < ODITHER_SIZE; row++) {
+    for (col = 0; col < ODITHER_SIZE; col++) {
+      order = (int)base_dither_matrix[row][col];
+      numer = ((JLONG)(ODITHER_CELLS - 1 - 2 * order)) * MAXJSAMPLE;
+      /* Divide magnitudes and restore the sign afterwards: this truncates */
+      /* toward zero even where C leaves negative division unspecified. */
+      quot = (numer < 0 ? -numer : numer) / denom;
+      matrix[row][col] = (int)(numer < 0 ? -quot : quot);
+    }
+  }
+  return matrix;
+}
+
+
+/*
+ * Create the ordered-dither tables.
+ * Components having the same number of representative colors may
+ * share a dither table.
+ */
+
+LOCAL(void)
+create_odither_tables(j_decompress_ptr cinfo)
+{
+  /* Gives every output component an ordered-dither table, building a new
+   * one only when no earlier component has the same number of levels.
+   */
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int comp, prev;
+
+  for (comp = 0; comp < cinfo->out_color_components; comp++) {
+    int levels = cquantize->Ncolors[comp];  /* distinct values, this comp */
+    ODITHER_MATRIX_PTR table = NULL;
+
+    /* Look for the first earlier component with an identical level count. */
+    for (prev = 0; prev < comp; prev++) {
+      if (cquantize->Ncolors[prev] != levels)
+        continue;
+      table = cquantize->odither[prev];
+      break;
+    }
+
+    /* Nothing to share: build a fresh table. */
+    if (table == NULL)
+      table = make_odither_array(cinfo, levels);
+
+    cquantize->odither[comp] = table;
+  }
+}
+
+
+/*
+ * Map some rows of pixels to the output colormapped representation.
+ */
+
+METHODDEF(void)
+color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+               JSAMPARRAY output_buf, int num_rows)
+/* Undithered mapping for an arbitrary number of color components */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  JSAMPARRAY index_tables = cquantize->colorindex;
+  int ncomps = cinfo->out_color_components;
+  JDIMENSION ncols = cinfo->output_width;
+  JDIMENSION x;
+  JSAMPROW src, dst;
+  int y, c, code;
+
+  for (y = 0; y < num_rows; y++) {
+    src = input_buf[y];
+    dst = output_buf[y];
+    for (x = 0; x < ncols; x++) {
+      /* The per-component table entries are premultiplied, so the colormap
+       * index of a pixel is simply their sum.
+       */
+      code = 0;
+      for (c = 0; c < ncomps; c++)
+        code += index_tables[c][*src++];
+      dst[x] = (JSAMPLE)code;
+    }
+  }
+}
+
+
+METHODDEF(void)
+color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
+/* Undithered mapping specialized for exactly three color components */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  JSAMPROW lut0 = cquantize->colorindex[0];
+  JSAMPROW lut1 = cquantize->colorindex[1];
+  JSAMPROW lut2 = cquantize->colorindex[2];
+  JDIMENSION ncols = cinfo->output_width;
+  JDIMENSION x;
+  JSAMPROW src, dst;
+  int y;
+
+  for (y = 0; y < num_rows; y++) {
+    src = input_buf[y];
+    dst = output_buf[y];
+    /* Each input pixel is three interleaved samples; the output index is
+     * the sum of the three premultiplied table entries.
+     */
+    for (x = 0; x < ncols; x++, src += 3)
+      dst[x] = (JSAMPLE)(lut0[src[0]] + lut1[src[1]] + lut2[src[2]]);
+  }
+}
+
+
+METHODDEF(void)
+quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                    JSAMPARRAY output_buf, int num_rows)
+/* Ordered-dither mapping for an arbitrary number of color components */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int ncomps = cinfo->out_color_components;
+  JDIMENSION ncols = cinfo->output_width;
+  JDIMENSION x;
+  int y, c;
+
+  for (y = 0; y < num_rows; y++) {
+    int drow = cquantize->row_index;    /* active dither-matrix row */
+
+    /* Components are accumulated one at a time, so start from zero. */
+    jzero_far((void *)output_buf[y], (size_t)(ncols * sizeof(JSAMPLE)));
+
+    for (c = 0; c < ncomps; c++) {
+      JSAMPROW src = input_buf[y] + c;
+      JSAMPROW dst = output_buf[y];
+      JSAMPROW lut = cquantize->colorindex[c];
+      int *offsets = cquantize->odither[c][drow];
+      int dcol = 0;                     /* active dither-matrix column */
+
+      for (x = 0; x < ncols; x++) {
+        /* sample + dither may leave 0..MAXJSAMPLE.  No explicit clamp is
+         * needed because the colorindex table is padded on both sides to
+         * return the correct answer for such indexes; the dither never
+         * exceeds +- MAXJSAMPLE, which fixes the amount of padding.
+         */
+        dst[x] += lut[*src + offsets[dcol]];
+        src += ncomps;
+        dcol = (dcol + 1) & ODITHER_MASK;
+      }
+    }
+
+    /* Step to the next dither row, wrapping around, and remember it. */
+    cquantize->row_index = (drow + 1) & ODITHER_MASK;
+  }
+}
+
+
+METHODDEF(void)
+quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPARRAY output_buf, int num_rows)
+/* Ordered-dither mapping specialized for exactly three color components */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  JSAMPROW lut0 = cquantize->colorindex[0];
+  JSAMPROW lut1 = cquantize->colorindex[1];
+  JSAMPROW lut2 = cquantize->colorindex[2];
+  JDIMENSION ncols = cinfo->output_width;
+  JDIMENSION x;
+  int y;
+
+  for (y = 0; y < num_rows; y++) {
+    int drow = cquantize->row_index;    /* active dither-matrix row */
+    int *offs0 = cquantize->odither[0][drow];
+    int *offs1 = cquantize->odither[1][drow];
+    int *offs2 = cquantize->odither[2][drow];
+    JSAMPROW src = input_buf[y];
+    JSAMPROW dst = output_buf[y];
+    int dcol = 0;                       /* active dither-matrix column */
+
+    for (x = 0; x < ncols; x++, src += 3) {
+      /* All three components of a pixel share the same matrix cell. */
+      int code = lut0[src[0] + offs0[dcol]];
+
+      code += lut1[src[1] + offs1[dcol]];
+      code += lut2[src[2] + offs2[dcol]];
+      dst[x] = (JSAMPLE)code;
+      dcol = (dcol + 1) & ODITHER_MASK;
+    }
+
+    /* Step to the next dither row, wrapping around, and remember it. */
+    cquantize->row_index = (drow + 1) & ODITHER_MASK;
+  }
+}
+
+
+METHODDEF(void)
+quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
+/* General case, with Floyd-Steinberg dithering */
+/* Each row is handled one color component at a time: the output row is
+ * zeroed, then every component adds its premultiplied index into it.
+ * Scan direction alternates per row (tracked in cquantize->on_odd_row), and
+ * cquantize->fserrors[ci] carries the errors owed to the next row, with one
+ * spare entry at each end so the edge pixels need no special casing.
+ */
+{
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+ register LOCFSERROR cur; /* current error or pixel value */
+ LOCFSERROR belowerr; /* error for pixel below cur */
+ LOCFSERROR bpreverr; /* error for below/prev col */
+ LOCFSERROR bnexterr; /* error for below/next col */
+ LOCFSERROR delta;
+ register FSERRPTR errorptr; /* => fserrors[] at column before current */
+ register JSAMPROW input_ptr;
+ register JSAMPROW output_ptr;
+ JSAMPROW colorindex_ci;
+ JSAMPROW colormap_ci;
+ int pixcode;
+ int nc = cinfo->out_color_components;
+ int dir; /* 1 for left-to-right, -1 for right-to-left */
+ int dirnc; /* dir * nc */
+ int ci;
+ int row;
+ JDIMENSION col;
+ JDIMENSION width = cinfo->output_width;
+ JSAMPLE *range_limit = cinfo->sample_range_limit;
+ SHIFT_TEMPS
+
+ for (row = 0; row < num_rows; row++) {
+ /* Initialize output values to 0 so can process components separately */
+ jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
+ for (ci = 0; ci < nc; ci++) {
+ input_ptr = input_buf[row] + ci;
+ output_ptr = output_buf[row];
+ if (cquantize->on_odd_row) {
+ /* work right to left in this row */
+ input_ptr += (width - 1) * nc; /* so point to rightmost pixel */
+ output_ptr += width - 1;
+ dir = -1;
+ dirnc = -nc;
+ errorptr = cquantize->fserrors[ci] + (width + 1); /* => entry after last column */
+ } else {
+ /* work left to right in this row */
+ dir = 1;
+ dirnc = nc;
+ errorptr = cquantize->fserrors[ci]; /* => entry before first column */
+ }
+ colorindex_ci = cquantize->colorindex[ci];
+ colormap_ci = cquantize->sv_colormap[ci];
+ /* Preset error values: no error propagated to first pixel from left */
+ cur = 0;
+ /* and no error propagated to row below yet */
+ belowerr = bpreverr = 0;
+
+ for (col = width; col > 0; col--) {
+ /* cur holds the error propagated from the previous pixel on the
+ * current line. Add the error propagated from the previous line
+ * to form the complete error correction term for this pixel, and
+ * round the error term (which is expressed * 16) to an integer.
+ * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct
+ * for either sign of the error value.
+ * Note: errorptr points to *previous* column's array entry.
+ */
+ cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4);
+ /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
+ * The maximum error is +- MAXJSAMPLE; this sets the required size
+ * of the range_limit array.
+ */
+ cur += *input_ptr;
+ cur = range_limit[cur];
+ /* Select output value, accumulate into output code for this pixel */
+ pixcode = colorindex_ci[cur];
+ *output_ptr += (JSAMPLE)pixcode;
+ /* Compute actual representation error at this pixel */
+ /* Note: we can do this even though we don't have the final */
+ /* pixel code, because the colormap is orthogonal. */
+ cur -= colormap_ci[pixcode];
+ /* Compute error fractions to be propagated to adjacent pixels.
+ * Add these into the running sums, and simultaneously shift the
+ * next-line error sums left by 1 column.
+ */
+ /* The statements below reuse cur as a running multiple of the error
+ * (1x, 3x, 5x, 7x); their order is significant.
+ */
+ bnexterr = cur;
+ delta = cur * 2;
+ cur += delta; /* form error * 3 */
+ errorptr[0] = (FSERROR)(bpreverr + cur);
+ cur += delta; /* form error * 5 */
+ bpreverr = belowerr + cur;
+ belowerr = bnexterr;
+ cur += delta; /* form error * 7 */
+ /* At this point cur contains the 7/16 error value to be propagated
+ * to the next pixel on the current line, and all the errors for the
+ * next line have been shifted over. We are therefore ready to move on.
+ */
+ input_ptr += dirnc; /* advance input ptr to next column */
+ output_ptr += dir; /* advance output ptr to next column */
+ errorptr += dir; /* advance errorptr to current column */
+ }
+ /* Post-loop cleanup: we must unload the final error value into the
+ * final fserrors[] entry. Note we need not unload belowerr because
+ * it is for the dummy column before or after the actual array.
+ */
+ errorptr[0] = (FSERROR)bpreverr; /* unload prev err into array */
+ }
+ /* Flip the scan direction once per row, after all components are done */
+ cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE);
+ }
+}
+
+
+/*
+ * Allocate workspace for Floyd-Steinberg errors.
+ */
+
+LOCAL(void)
+alloc_fs_workspace(j_decompress_ptr cinfo)
+{
+  /* One error row per output component, each holding output_width + 2
+   * FSERROR entries, taken from the image-lifetime pool.
+   */
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  size_t row_bytes = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
+  int comp = 0;
+
+  while (comp < cinfo->out_color_components) {
+    cquantize->fserrors[comp] = (FSERRPTR)
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  row_bytes);
+    comp++;
+  }
+}
+
+
+/*
+ * Initialize for one-pass color quantization.
+ */
+
+METHODDEF(void)
+start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
+{
+  /* Publishes the saved colormap and selects the per-row quantization
+   * routine that matches the current dither mode, creating or resetting
+   * whatever state that mode needs.  is_pre_scan is not consulted.
+   */
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int three_comps = (cinfo->out_color_components == 3);
+
+  /* Hand the colormap built at init time to the decompressor. */
+  cinfo->colormap = cquantize->sv_colormap;
+  cinfo->actual_number_of_colors = cquantize->sv_actual;
+
+  switch (cinfo->dither_mode) {
+  case JDITHER_NONE:
+    cquantize->pub.color_quantize =
+      three_comps ? color_quantize3 : color_quantize;
+    break;
+
+  case JDITHER_ORDERED:
+    cquantize->pub.color_quantize =
+      three_comps ? quantize3_ord_dither : quantize_ord_dither;
+    cquantize->row_index = 0;           /* restart the dither pattern */
+    /* A switch to ordered dither from some other mode leaves us with an
+     * unpadded color index table; rebuild it with padding.  That spends
+     * extra memory, but the situation should be rare.
+     */
+    if (!cquantize->is_padded)
+      create_colorindex(cinfo);
+    /* Build the dither tables on first use only. */
+    if (cquantize->odither[0] == NULL)
+      create_odither_tables(cinfo);
+    break;
+
+  case JDITHER_FS:
+    cquantize->pub.color_quantize = quantize_fs_dither;
+    cquantize->on_odd_row = FALSE;      /* first row runs left to right */
+    /* Allocate the error rows on first use only. */
+    if (cquantize->fserrors[0] == NULL)
+      alloc_fs_workspace(cinfo);
+    /* Every pass starts with no carried-over error. */
+    {
+      size_t row_bytes =
+        (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
+      int comp;
+
+      for (comp = 0; comp < cinfo->out_color_components; comp++)
+        jzero_far((void *)cquantize->fserrors[comp], row_bytes);
+    }
+    break;
+
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+}
+
+
+/*
+ * Finish up at the end of the pass.
+ */
+
+METHODDEF(void)
+finish_pass_1_quant(j_decompress_ptr cinfo)
+{
+ /* Installed as pub.finish_pass by jinit_1pass_quantizer; deliberately
+ * empty, and cinfo is unused.
+ */
+ /* no work in 1-pass case */
+}
+
+
+/*
+ * Switch to a new external colormap between output passes.
+ * Shouldn't get to this module!
+ */
+
+METHODDEF(void)
+new_color_map_1_quant(j_decompress_ptr cinfo)
+{
+ /* Installed as pub.new_color_map by jinit_1pass_quantizer.  The one-pass
+ * quantizer builds its own colormap, so any call is reported as
+ * JERR_MODE_CHANGE through ERREXIT.
+ */
+ ERREXIT(cinfo, JERR_MODE_CHANGE);
+}
+
+
+/*
+ * Module initialization routine for 1-pass color quantization.
+ */
+
+GLOBAL(void)
+jinit_1pass_quantizer(j_decompress_ptr cinfo)
+{
+  /* Creates the one-pass quantizer object, attaches it to cinfo, validates
+   * the request, and builds the colormap and color index table.
+   */
+  my_cquantize_ptr quant = (my_cquantize_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_cquantizer));
+
+  /* Attach the object and fill in its method table. */
+  cinfo->cquantize = (struct jpeg_color_quantizer *)quant;
+  quant->pub.new_color_map = new_color_map_1_quant;
+  quant->pub.finish_pass = finish_pass_1_quant;
+  quant->pub.start_pass = start_pass_1_quant;
+
+  /* A NULL first entry marks "not allocated yet" for both kinds of dither
+   * workspace; start_pass_1_quant tests these.
+   */
+  quant->odither[0] = NULL;
+  quant->fserrors[0] = NULL;
+
+  /* The per-component arrays hold at most MAX_Q_COMPS entries. */
+  if (cinfo->out_color_components > MAX_Q_COMPS)
+    ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
+  /* Every colormap index has to fit in a JSAMPLE. */
+  if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
+
+  /* Build the colormap first, then the table that indexes into it. */
+  create_colormap(cinfo);
+  create_colorindex(cinfo);
+
+  /* When F-S dithering is already selected, grab its workspace right away
+   * so the memory manager can account for it.  If the mode is switched to
+   * F-S in a later pass instead, the space is allocated at that point and
+   * may push usage past max_memory_to_use.
+   */
+  if (cinfo->dither_mode == JDITHER_FS)
+    alloc_fs_workspace(cinfo);
+}
+
+#endif /* QUANT_1PASS_SUPPORTED */
diff --git a/media/libjpeg/jquant2.c b/media/libjpeg/jquant2.c
new file mode 100644
index 0000000000..44efb18cad
--- /dev/null
+++ b/media/libjpeg/jquant2.c
@@ -0,0 +1,1285 @@
+/*
+ * jquant2.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2014-2015, 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains 2-pass color quantization (color mapping) routines.
+ * These routines provide selection of a custom color map for an image,
+ * followed by mapping of the image to that color map, with optional
+ * Floyd-Steinberg dithering.
+ * It is also possible to use just the second pass to map to an arbitrary
+ * externally-given color map.
+ *
+ * Note: ordered dithering is not supported, since there isn't any fast
+ * way to compute intercolor distances; it's unclear that ordered dither's
+ * fundamental assumptions even hold with an irregularly spaced color map.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#ifdef QUANT_2PASS_SUPPORTED
+
+
+/*
+ * This module implements the well-known Heckbert paradigm for color
+ * quantization. Most of the ideas used here can be traced back to
+ * Heckbert's seminal paper
+ * Heckbert, Paul. "Color Image Quantization for Frame Buffer Display",
+ * Proc. SIGGRAPH '82, Computer Graphics v.16 #3 (July 1982), pp 297-304.
+ *
+ * In the first pass over the image, we accumulate a histogram showing the
+ * usage count of each possible color. To keep the histogram to a reasonable
+ * size, we reduce the precision of the input; typical practice is to retain
+ * 5 or 6 bits per color, so that 8 or 4 different input values are counted
+ * in the same histogram cell.
+ *
+ * Next, the color-selection step begins with a box representing the whole
+ * color space, and repeatedly splits the "largest" remaining box until we
+ * have as many boxes as desired colors. Then the mean color in each
+ * remaining box becomes one of the possible output colors.
+ *
+ * The second pass over the image maps each input pixel to the closest output
+ * color (optionally after applying a Floyd-Steinberg dithering correction).
+ * This mapping is logically trivial, but making it go fast enough requires
+ * considerable care.
+ *
+ * Heckbert-style quantizers vary a good deal in their policies for choosing
+ * the "largest" box and deciding where to cut it. The particular policies
+ * used here have proved out well in experimental comparisons, but better ones
+ * may yet be found.
+ *
+ * In earlier versions of the IJG code, this module quantized in YCbCr color
+ * space, processing the raw upsampled data without a color conversion step.
+ * This allowed the color conversion math to be done only once per colormap
+ * entry, not once per pixel. However, that optimization precluded other
+ * useful optimizations (such as merging color conversion with upsampling)
+ * and it also interfered with desired capabilities such as quantizing to an
+ * externally-supplied colormap. We have therefore abandoned that approach.
+ * The present code works in the post-conversion color space, typically RGB.
+ *
+ * To improve the visual quality of the results, we actually work in scaled
+ * RGB space, giving G distances more weight than R, and R in turn more than
+ * B. To do everything in integer math, we must use integer scale factors.
+ * The 2/3/1 scale factors used here correspond loosely to the relative
+ * weights of the colors in the NTSC grayscale equation.
+ * If you want to use this code to quantize a non-RGB color space, you'll
+ * probably need to change these scale factors.
+ */
+
+#define R_SCALE 2 /* scale R distances by this much */
+#define G_SCALE 3 /* scale G distances by this much */
+#define B_SCALE 1 /* and B by this much */
+
+/* Scale factors in fixed R, G, B order (index 0 = R, 1 = G, 2 = B). */
+static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
+/* Scale factor for histogram axis 0, 1 and 2 respectively.
+ * NOTE: these expand to an expression that reads cinfo->out_color_space, so
+ * they can only be used where a variable named `cinfo` is in scope.
+ * rgb_red/rgb_green/rgb_blue are tables declared elsewhere; presumably they
+ * give each channel's position for a color space -- TODO confirm.
+ */
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
+
+/*
+ * First we have the histogram data structure and routines for creating it.
+ *
+ * The number of bits of precision can be adjusted by changing these symbols.
+ * We recommend keeping 6 bits for G and 5 each for R and B.
+ * If you have plenty of memory and cycles, 6 bits all around gives marginally
+ * better results; if you are short of memory, 5 bits all around will save
+ * some space but degrade the results.
+ * To maintain a fully accurate histogram, we'd need to allocate a "long"
+ * (preferably unsigned long) for each cell. In practice this is overkill;
+ * we can get by with 16 bits per cell. Few of the cell counts will overflow,
+ * and clamping those that do overflow to the maximum value will give close-
+ * enough results. This reduces the recommended histogram size from 256Kb
+ * to 128Kb, which is a useful savings on PC-class machines.
+ * (In the second pass the histogram space is re-used for pixel mapping data;
+ * in that capacity, each cell must be able to store zero to the number of
+ * desired colors. 16 bits/cell is plenty for that too.)
+ * Since the JPEG code is intended to run in small memory model on 80x86
+ * machines, we can't just allocate the histogram in one chunk. Instead
+ * of a true 3-D array, we use a row of pointers to 2-D arrays. Each
+ * pointer corresponds to a C0 value (typically 2^5 = 32 pointers) and
+ * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
+ */
+
+#define MAXNUMCOLORS (MAXJSAMPLE + 1) /* maximum size of colormap */
+
+/* These will do the right thing for either R,G,B or B,G,R color order,
+ * but you may not like the results for other color orders.
+ */
+#define HIST_C0_BITS 5 /* bits of precision in R/B histogram */
+#define HIST_C1_BITS 6 /* bits of precision in G histogram */
+#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */
+
+/* Number of elements along histogram axes. */
+#define HIST_C0_ELEMS (1 << HIST_C0_BITS)
+#define HIST_C1_ELEMS (1 << HIST_C1_BITS)
+#define HIST_C2_ELEMS (1 << HIST_C2_BITS)
+
+/* These are the amounts to shift an input value to get a histogram index. */
+#define C0_SHIFT (BITS_IN_JSAMPLE - HIST_C0_BITS)
+#define C1_SHIFT (BITS_IN_JSAMPLE - HIST_C1_BITS)
+#define C2_SHIFT (BITS_IN_JSAMPLE - HIST_C2_BITS)
+
+
+typedef UINT16 histcell; /* histogram cell; prefer an unsigned type */
+
+typedef histcell *histptr; /* for pointers to histogram cells */
+
+typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
+typedef hist1d *hist2d; /* type for the 2nd-level pointers */
+typedef hist2d *hist3d; /* type for top-level pointer */
+
+
+/* Declarations for Floyd-Steinberg dithering.
+ *
+ * Errors are accumulated into the array fserrors[], at a resolution of
+ * 1/16th of a pixel count. The error at a given pixel is propagated
+ * to its not-yet-processed neighbors using the standard F-S fractions,
+ * ... (here) 7/16
+ * 3/16 5/16 1/16
+ * We work left-to-right on even rows, right-to-left on odd rows.
+ *
+ * We can get away with a single array (holding one row's worth of errors)
+ * by using it to store the current row's errors at pixel columns not yet
+ * processed, but the next row's errors at columns already processed. We
+ * need only a few extra variables to hold the errors immediately around the
+ * current column. (If we are lucky, those variables are in registers, but
+ * even if not, they're probably cheaper to access than array elements are.)
+ *
+ * The fserrors[] array has (#columns + 2) entries; the extra entry at
+ * each end saves us from special-casing the first and last pixels.
+ * Each entry is three values long, one value for each color component.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+typedef INT16 FSERROR; /* 16 bits should be enough */
+typedef int LOCFSERROR; /* use 'int' for calculation temps */
+#else
+typedef JLONG FSERROR; /* may need more than 16 bits */
+typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */
+#endif
+
+typedef FSERROR *FSERRPTR; /* pointer to error array */
+
+
+/* Private subobject */
+
+/* Private state of the two-pass quantizer.  cinfo->cquantize is cast to
+ * my_cquantize_ptr by the routines in this file to reach these fields.
+ */
+typedef struct {
+ struct jpeg_color_quantizer pub; /* public fields */
+
+ /* Space for the eventually created colormap is stashed here */
+ JSAMPARRAY sv_colormap; /* colormap allocated at init time */
+ int desired; /* desired # of colors = size of colormap */
+
+ /* Variables for accumulating image statistics */
+ /* Indexed as histogram[c0][c1][c2] using the shifted sample values */
+ hist3d histogram; /* pointer to the histogram */
+
+ boolean needs_zeroed; /* TRUE if next pass must zero histogram */
+
+ /* Variables for Floyd-Steinberg dithering */
+ FSERRPTR fserrors; /* accumulated errors */
+ boolean on_odd_row; /* flag to remember which row we are on */
+ int *error_limiter; /* table for clamping the applied error */
+} my_cquantizer;
+
+typedef my_cquantizer *my_cquantize_ptr;
+
+
+/*
+ * Prescan some rows of pixels.
+ * In this module the prescan simply updates the histogram, which has been
+ * initialized to zeroes by start_pass.
+ * An output_buf parameter is required by the method signature, but no data
+ * is actually output (in fact the buffer controller is probably passing a
+ * NULL pointer).
+ */
+
+METHODDEF(void)
+prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                 JSAMPARRAY output_buf, int num_rows)
+{
+  /* Tallies every pixel of the given rows into the reduced-precision
+   * histogram.  Pixels are read as three interleaved samples; output_buf
+   * is never touched.
+   */
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  hist3d hist = cquantize->histogram;
+  JDIMENSION ncols = cinfo->output_width;
+  int y;
+
+  for (y = 0; y < num_rows; y++) {
+    JSAMPROW px = input_buf[y];
+    JDIMENSION x;
+
+    for (x = 0; x < ncols; x++, px += 3) {
+      /* Drop the low-order bits of each sample to locate the cell. */
+      histptr cell = &hist[px[0] >> C0_SHIFT]
+                          [px[1] >> C1_SHIFT]
+                          [px[2] >> C2_SHIFT];
+
+      /* Saturate: if the increment wrapped around, step back to the
+       * largest representable count.
+       */
+      if (++(*cell) <= 0)
+        (*cell)--;
+    }
+  }
+}
+
+
+/*
+ * Next we have the really interesting routines: selection of a colormap
+ * given the completed histogram.
+ * These routines work with a list of "boxes", each representing a rectangular
+ * subset of the input color space (to histogram precision).
+ */
+
+/* One axis-aligned region of the histogram.  update_box() is the routine
+ * that fills in volume and colorcount.
+ */
+typedef struct {
+ /* The bounds of the box (inclusive); expressed as histogram indexes */
+ int c0min, c0max;
+ int c1min, c1max;
+ int c2min, c2max;
+ /* The volume (actually 2-norm) of the box */
+ /* More precisely, update_box() stores the sum of the squared, scaled axis
+ * lengths; it is 0 when the box is one cell wide on every axis, which is
+ * what the box-selection routines treat as "not splittable".
+ */
+ JLONG volume;
+ /* The number of nonzero histogram cells within this box */
+ long colorcount;
+} box;
+
+typedef box *boxptr;
+
+
+LOCAL(boxptr)
+find_biggest_color_pop(boxptr boxlist, int numboxes)
+/* Pick the splittable box that contains the most distinct colors. */
+/* The result is NULL when nothing can be split any further. */
+{
+  boxptr best = NULL;
+  long best_count = 0;
+  int idx;
+
+  for (idx = 0; idx < numboxes; idx++) {
+    boxptr cand = &boxlist[idx];
+
+    /* Skip unsplittable boxes and anything that does not strictly beat
+     * the current leader, so the earliest box wins a tie.
+     */
+    if (cand->volume <= 0 || cand->colorcount <= best_count)
+      continue;
+    best = cand;
+    best_count = cand->colorcount;
+  }
+  return best;
+}
+
+
+LOCAL(boxptr)
+find_biggest_volume(boxptr boxlist, int numboxes)
+/* Pick the splittable box whose (scaled) volume is the largest. */
+/* The result is NULL when nothing can be split any further. */
+{
+  boxptr best = NULL;
+  JLONG best_volume = 0;
+  int idx;
+
+  for (idx = 0; idx < numboxes; idx++) {
+    boxptr cand = &boxlist[idx];
+
+    /* Starting the leader at 0 rules out zero-volume boxes, and the strict
+     * comparison lets the earliest box win a tie.
+     */
+    if (cand->volume <= best_volume)
+      continue;
+    best = cand;
+    best_volume = cand->volume;
+  }
+  return best;
+}
+
+
+LOCAL(void)
+update_box(j_decompress_ptr cinfo, boxptr boxp)
+/* Shrink the min/max bounds of a box to enclose only nonzero elements, */
+/* and recompute its volume and population */
+/* Each of the six bounds is tightened by scanning planes inward from that
+ * face until a nonzero histogram cell is found; the goto leaves the nested
+ * loops at the first hit.  If no cell is found the bound is left unchanged.
+ * Later scans use the bounds already tightened by earlier ones.
+ */
+{
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+ hist3d histogram = cquantize->histogram;
+ histptr histp;
+ int c0, c1, c2;
+ int c0min, c0max, c1min, c1max, c2min, c2max;
+ JLONG dist0, dist1, dist2;
+ long ccount;
+
+ c0min = boxp->c0min; c0max = boxp->c0max;
+ c1min = boxp->c1min; c1max = boxp->c1max;
+ c2min = boxp->c2min; c2max = boxp->c2max;
+
+ /* An axis that is already a single cell wide is skipped entirely */
+ if (c0max > c0min)
+ for (c0 = c0min; c0 <= c0max; c0++)
+ for (c1 = c1min; c1 <= c1max; c1++) {
+ histp = &histogram[c0][c1][c2min];
+ for (c2 = c2min; c2 <= c2max; c2++)
+ if (*histp++ != 0) {
+ boxp->c0min = c0min = c0;
+ goto have_c0min;
+ }
+ }
+have_c0min:
+ if (c0max > c0min)
+ for (c0 = c0max; c0 >= c0min; c0--)
+ for (c1 = c1min; c1 <= c1max; c1++) {
+ histp = &histogram[c0][c1][c2min];
+ for (c2 = c2min; c2 <= c2max; c2++)
+ if (*histp++ != 0) {
+ boxp->c0max = c0max = c0;
+ goto have_c0max;
+ }
+ }
+have_c0max:
+ if (c1max > c1min)
+ for (c1 = c1min; c1 <= c1max; c1++)
+ for (c0 = c0min; c0 <= c0max; c0++) {
+ histp = &histogram[c0][c1][c2min];
+ for (c2 = c2min; c2 <= c2max; c2++)
+ if (*histp++ != 0) {
+ boxp->c1min = c1min = c1;
+ goto have_c1min;
+ }
+ }
+have_c1min:
+ if (c1max > c1min)
+ for (c1 = c1max; c1 >= c1min; c1--)
+ for (c0 = c0min; c0 <= c0max; c0++) {
+ histp = &histogram[c0][c1][c2min];
+ for (c2 = c2min; c2 <= c2max; c2++)
+ if (*histp++ != 0) {
+ boxp->c1max = c1max = c1;
+ goto have_c1max;
+ }
+ }
+have_c1max:
+ /* For the c2 bounds the inner loop walks along c1 at a fixed c2, so the
+ * cell pointer advances by one whole hist1d row (HIST_C2_ELEMS cells).
+ */
+ if (c2max > c2min)
+ for (c2 = c2min; c2 <= c2max; c2++)
+ for (c0 = c0min; c0 <= c0max; c0++) {
+ histp = &histogram[c0][c1min][c2];
+ for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
+ if (*histp != 0) {
+ boxp->c2min = c2min = c2;
+ goto have_c2min;
+ }
+ }
+have_c2min:
+ if (c2max > c2min)
+ for (c2 = c2max; c2 >= c2min; c2--)
+ for (c0 = c0min; c0 <= c0max; c0++) {
+ histp = &histogram[c0][c1min][c2];
+ for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
+ if (*histp != 0) {
+ boxp->c2max = c2max = c2;
+ goto have_c2max;
+ }
+ }
+have_c2max:
+
+ /* Update box volume.
+ * We use 2-norm rather than real volume here; this biases the method
+ * against making long narrow boxes, and it has the side benefit that
+ * a box is splittable iff norm > 0.
+ * Since the differences are expressed in histogram-cell units,
+ * we have to shift back to JSAMPLE units to get consistent distances;
+ * after which, we scale according to the selected distance scale factors.
+ */
+ dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
+ dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
+ dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
+ boxp->volume = dist0 * dist0 + dist1 * dist1 + dist2 * dist2;
+
+ /* Now scan remaining volume of box and compute population */
+ /* "Population" counts nonzero cells, not the pixel counts they hold */
+ ccount = 0;
+ for (c0 = c0min; c0 <= c0max; c0++)
+ for (c1 = c1min; c1 <= c1max; c1++) {
+ histp = &histogram[c0][c1][c2min];
+ for (c2 = c2min; c2 <= c2max; c2++, histp++)
+ if (*histp != 0) {
+ ccount++;
+ }
+ }
+ boxp->colorcount = ccount;
+}
+
+
+LOCAL(int)
+median_cut(j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
+           int desired_colors)
+/* Keep splitting the "largest" box until desired_colors boxes exist or */
+/* nothing splittable is left; returns the resulting number of boxes. */
+{
+  while (numboxes < desired_colors) {
+    boxptr victim, fresh;
+    int len0, len1, len2;
+    int mid_axis, mid_len, last_axis, last_len;
+    int axis, longest, cut;
+
+    /* Selection policy: most populated box while fewer than half of the
+     * requested boxes exist, largest volume afterwards.
+     */
+    victim = (numboxes * 2 <= desired_colors) ?
+             find_biggest_color_pop(boxlist, numboxes) :
+             find_biggest_volume(boxlist, numboxes);
+    if (victim == NULL)
+      break;                    /* every remaining box is a single cell */
+
+    /* The new box goes in the next free slot and starts out with the same
+     * bounds as the box being split.
+     */
+    fresh = &boxlist[numboxes];
+    fresh->c0min = victim->c0min;  fresh->c0max = victim->c0max;
+    fresh->c1min = victim->c1min;  fresh->c1max = victim->c1max;
+    fresh->c2min = victim->c2min;  fresh->c2max = victim->c2max;
+
+    /* Scaled length of each axis, computed exactly as in update_box. */
+    len0 = ((victim->c0max - victim->c0min) << C0_SHIFT) * C0_SCALE;
+    len1 = ((victim->c1max - victim->c1min) << C1_SHIFT) * C1_SCALE;
+    len2 = ((victim->c2max - victim->c2min) << C2_SHIFT) * C2_SCALE;
+
+    /* Cut across the longest axis.  Ties go to green first, then red, then
+     * blue, which is only right for R,G,B and B,G,R component orders:
+     * axis 1 is always the first choice, and the axis holding red is
+     * considered before the one holding blue.
+     */
+    if (rgb_red[cinfo->out_color_space] == 0) {
+      mid_axis = 0;   mid_len = len0;
+      last_axis = 2;  last_len = len2;
+    } else {
+      mid_axis = 2;   mid_len = len2;
+      last_axis = 0;  last_len = len0;
+    }
+    axis = 1;
+    longest = len1;
+    if (mid_len > longest) {
+      longest = mid_len;
+      axis = mid_axis;
+    }
+    if (last_len > longest)
+      axis = last_axis;
+
+    /* Cut at the midpoint of the chosen axis.  `cut` becomes the upper
+     * bound of the lower half, so it is always below the old upper bound;
+     * because the box was already shrunk to fit, neither half is empty.
+     */
+    if (axis == 0) {
+      cut = (victim->c0max + victim->c0min) / 2;
+      victim->c0max = cut;
+      fresh->c0min = cut + 1;
+    } else if (axis == 1) {
+      cut = (victim->c1max + victim->c1min) / 2;
+      victim->c1max = cut;
+      fresh->c1min = cut + 1;
+    } else {
+      cut = (victim->c2max + victim->c2min) / 2;
+      victim->c2max = cut;
+      fresh->c2min = cut + 1;
+    }
+
+    /* Re-shrink both halves and refresh their statistics. */
+    update_box(cinfo, victim);
+    update_box(cinfo, fresh);
+    numboxes++;
+  }
+
+  return numboxes;
+}
+
+
+LOCAL(void)
+compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
+/* Compute representative color for a box, put it in colormap[icolor] */
+/* cinfo:  decompressor whose histogram is read and whose colormap is */
+/*         written (three components are assumed). */
+/* boxp:   box to summarize; bounds are inclusive histogram indexes. */
+/* icolor: colormap column that receives the result. */
+{
+  /* Current algorithm: mean weighted by pixels (not colors) */
+  /* Note it is important to get the rounding correct! */
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  hist3d histogram = cquantize->histogram;
+  histptr histp;
+  int c0, c1, c2;
+  int c0min, c0max, c1min, c1max, c2min, c2max;
+  long count;
+  long total = 0;
+  long c0total = 0;
+  long c1total = 0;
+  long c2total = 0;
+
+  c0min = boxp->c0min;  c0max = boxp->c0max;
+  c1min = boxp->c1min;  c1max = boxp->c1max;
+  c2min = boxp->c2min;  c2max = boxp->c2max;
+
+  /* Accumulate the pixel count and, per axis, the sum of cell-center sample
+   * values weighted by the number of pixels in each cell.
+   */
+  for (c0 = c0min; c0 <= c0max; c0++)
+    for (c1 = c1min; c1 <= c1max; c1++) {
+      histp = &histogram[c0][c1][c2min];
+      for (c2 = c2min; c2 <= c2max; c2++) {
+        if ((count = *histp++) != 0) {
+          total += count;
+          c0total += ((c0 << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
+          c1total += ((c1 << C1_SHIFT) + ((1 << C1_SHIFT) >> 1)) * count;
+          c2total += ((c2 << C2_SHIFT) + ((1 << C2_SHIFT) >> 1)) * count;
+        }
+      }
+    }
+
+  if (total == 0) {
+    /* The box holds no pixels at all.  This happens when the histogram is
+     * completely empty, in which case median_cut() leaves the single initial
+     * box untouched.  Dividing by total would be a division by zero, so fall
+     * back to the geometric center of the box, expressed in sample units.
+     */
+    cinfo->colormap[0][icolor] =
+      (JSAMPLE)(((c0min + c0max + 1) << C0_SHIFT) >> 1);
+    cinfo->colormap[1][icolor] =
+      (JSAMPLE)(((c1min + c1max + 1) << C1_SHIFT) >> 1);
+    cinfo->colormap[2][icolor] =
+      (JSAMPLE)(((c2min + c2max + 1) << C2_SHIFT) >> 1);
+    return;
+  }
+
+  /* Adding total/2 before dividing rounds the mean to nearest. */
+  cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
+  cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
+  cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
+}
+
+
+LOCAL(void)
+select_colors(j_decompress_ptr cinfo, int desired_colors)
+/* Top-level color selection: build the box list with median_cut, then turn
+ * each resulting box into one colormap entry.
+ */
+{
+  boxptr boxes;
+  boxptr whole;
+  int nboxes, b;
+
+  /* Workspace holding one box per requested color */
+  boxes = (boxptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
+
+  /* Begin with a single box spanning the full histogram ... */
+  whole = &boxes[0];
+  whole->c0min = 0;
+  whole->c0max = MAXJSAMPLE >> C0_SHIFT;
+  whole->c1min = 0;
+  whole->c1max = MAXJSAMPLE >> C1_SHIFT;
+  whole->c2min = 0;
+  whole->c2max = MAXJSAMPLE >> C2_SHIFT;
+  /* ... then let update_box trim it and fill in its statistics */
+  update_box(cinfo, whole);
+
+  /* Split boxes until the list is final */
+  nboxes = median_cut(cinfo, boxes, 1, desired_colors);
+
+  /* One colormap entry per box */
+  for (b = 0; b < nboxes; b++)
+    compute_color(cinfo, boxes + b, b);
+
+  cinfo->actual_number_of_colors = nboxes;
+  TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, nboxes);
+}
+
+
+/*
+ * These routines are concerned with the time-critical task of mapping input
+ * colors to the nearest color in the selected colormap.
+ *
+ * We re-use the histogram space as an "inverse color map", essentially a
+ * cache for the results of nearest-color searches. All colors within a
+ * histogram cell will be mapped to the same colormap entry, namely the one
+ * closest to the cell's center. This may not be quite the closest entry to
+ * the actual input color, but it's almost as good. A zero in the cache
+ * indicates we haven't found the nearest color for that cell yet; the array
+ * is cleared to zeroes before starting the mapping pass. When we find the
+ * nearest color for a cell, its colormap index plus one is recorded in the
+ * cache for future use. The pass2 scanning routines call fill_inverse_cmap
+ * when they need to use an unfilled entry in the cache.
+ *
+ * Our method of efficiently finding nearest colors is based on the "locally
+ * sorted search" idea described by Heckbert and on the incremental distance
+ * calculation described by Spencer W. Thomas in chapter III.1 of Graphics
+ * Gems II (James Arvo, ed. Academic Press, 1991). Thomas points out that
+ * the distances from a given colormap entry to each cell of the histogram can
+ * be computed quickly using an incremental method: the differences between
+ * distances to adjacent cells themselves differ by a constant. This allows a
+ * fairly fast implementation of the "brute force" approach of computing the
+ * distance from every colormap entry to every histogram cell. Unfortunately,
+ * it needs a work array to hold the best-distance-so-far for each histogram
+ * cell (because the inner loop has to be over cells, not colormap entries).
+ * The work array elements have to be JLONGs, so the work array would need
+ * 256Kb at our recommended precision. This is not feasible on DOS machines.
+ *
+ * To get around these problems, we apply Thomas' method to compute the
+ * nearest colors for only the cells within a small subbox of the histogram.
+ * The work array need be only as big as the subbox, so the memory usage
+ * problem is solved. Furthermore, we need not fill subboxes that are never
+ * referenced in pass2; many images use only part of the color gamut, so a
+ * fair amount of work is saved. An additional advantage of this
+ * approach is that we can apply Heckbert's locality criterion to quickly
+ * eliminate colormap entries that are far away from the subbox; typically
+ * three-fourths of the colormap entries are rejected by Heckbert's criterion,
+ * and we need not compute their distances to individual cells in the subbox.
+ * The speed of this approach is heavily influenced by the subbox size: too
+ * small means too much overhead, too big loses because Heckbert's criterion
+ * can't eliminate as many colormap entries. Empirically the best subbox
+ * size seems to be about 1/512th of the histogram (1/8th in each direction).
+ *
+ * Thomas' article also describes a refined method which is asymptotically
+ * faster than the brute-force method, but it is also far more complex and
+ * cannot efficiently be applied to small subboxes. It is therefore not
+ * useful for programs intended to be portable to DOS machines. On machines
+ * with plenty of memory, filling the whole histogram in one shot with Thomas'
+ * refined method might be faster than the present code --- but then again,
+ * it might not be any faster, and it's certainly more complicated.
+ */
+
+
+/* log2(histogram cells in update box) for each axis; this can be adjusted */
+#define BOX_C0_LOG (HIST_C0_BITS - 3)
+#define BOX_C1_LOG (HIST_C1_BITS - 3)
+#define BOX_C2_LOG (HIST_C2_BITS - 3)
+
+#define BOX_C0_ELEMS (1 << BOX_C0_LOG) /* # of hist cells in update box */
+#define BOX_C1_ELEMS (1 << BOX_C1_LOG) /* same, along axis 1 */
+#define BOX_C2_ELEMS (1 << BOX_C2_LOG) /* same, along axis 2 */
+
+#define BOX_C0_SHIFT (C0_SHIFT + BOX_C0_LOG) /* update box ID << this = sample coordinate of the box origin */
+#define BOX_C1_SHIFT (C1_SHIFT + BOX_C1_LOG) /* same, along axis 1 */
+#define BOX_C2_SHIFT (C2_SHIFT + BOX_C2_LOG) /* same, along axis 2 */
+
+
+/*
+ * The next three routines implement inverse colormap filling. They could
+ * all be folded into one big routine, but splitting them up this way saves
+ * some stack space (the mindist[] and bestdist[] arrays need not coexist)
+ * and may allow some compilers to produce better code by registerizing more
+ * inner-loop variables.
+ */
+
+LOCAL(int)
+find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ JSAMPLE colorlist[])
+/* Locate the colormap entries close enough to an update box to be candidates
+ * for the nearest entry to some cell(s) in the update box. The update box
+ * is specified by the center coordinates of its first cell. The number of
+ * candidate colormap entries is returned, and their colormap indexes are
+ * placed in colorlist[].
+ * This routine uses Heckbert's "locally sorted search" criterion to select
+ * the colors that need further consideration.
+ *
+ * cinfo supplies the colormap (cinfo->colormap) and the number of entries
+ * in use (cinfo->actual_number_of_colors). minc0/minc1/minc2 are the
+ * center coordinates described above. colorlist[] is output only;
+ * candidates are appended in increasing colormap-index order.
+ */
+{
+ int numcolors = cinfo->actual_number_of_colors;
+ int maxc0, maxc1, maxc2;
+ int centerc0, centerc1, centerc2;
+ int i, x, ncolors;
+ JLONG minmaxdist, min_dist, max_dist, tdist;
+ JLONG mindist[MAXNUMCOLORS]; /* min distance to colormap entry i */
+
+ /* Compute true coordinates of update box's upper corner and center.
+ * Actually we compute the coordinates of the center of the upper-corner
+ * histogram cell, which are the upper bounds of the volume we care about.
+ * Note that since ">>" rounds down, the "center" values may be closer to
+ * min than to max; hence comparisons to them must be "<=", not "<".
+ */
+ maxc0 = minc0 + ((1 << BOX_C0_SHIFT) - (1 << C0_SHIFT));
+ centerc0 = (minc0 + maxc0) >> 1;
+ maxc1 = minc1 + ((1 << BOX_C1_SHIFT) - (1 << C1_SHIFT));
+ centerc1 = (minc1 + maxc1) >> 1;
+ maxc2 = minc2 + ((1 << BOX_C2_SHIFT) - (1 << C2_SHIFT));
+ centerc2 = (minc2 + maxc2) >> 1;
+
+ /* For each color in colormap, find:
+ * 1. its minimum squared-distance to any point in the update box
+ * (zero if color is within update box);
+ * 2. its maximum squared-distance to any point in the update box.
+ * Both of these can be found by considering only the corners of the box.
+ * We save the minimum distance for each color in mindist[];
+ * only the smallest maximum distance is of interest.
+ */
+ minmaxdist = 0x7FFFFFFFL; /* starting value; lowered by the loop below */
+
+ for (i = 0; i < numcolors; i++) {
+ /* We compute the squared-c0-distance term, then add in the other two. */
+ x = cinfo->colormap[0][i];
+ if (x < minc0) {
+ tdist = (x - minc0) * C0_SCALE;
+ min_dist = tdist * tdist;
+ tdist = (x - maxc0) * C0_SCALE;
+ max_dist = tdist * tdist;
+ } else if (x > maxc0) {
+ tdist = (x - maxc0) * C0_SCALE;
+ min_dist = tdist * tdist;
+ tdist = (x - minc0) * C0_SCALE;
+ max_dist = tdist * tdist;
+ } else {
+ /* within cell range so no contribution to min_dist */
+ min_dist = 0;
+ if (x <= centerc0) {
+ tdist = (x - maxc0) * C0_SCALE;
+ max_dist = tdist * tdist;
+ } else {
+ tdist = (x - minc0) * C0_SCALE;
+ max_dist = tdist * tdist;
+ }
+ }
+
+ x = cinfo->colormap[1][i];
+ if (x < minc1) {
+ tdist = (x - minc1) * C1_SCALE;
+ min_dist += tdist * tdist;
+ tdist = (x - maxc1) * C1_SCALE;
+ max_dist += tdist * tdist;
+ } else if (x > maxc1) {
+ tdist = (x - maxc1) * C1_SCALE;
+ min_dist += tdist * tdist;
+ tdist = (x - minc1) * C1_SCALE;
+ max_dist += tdist * tdist;
+ } else {
+ /* within cell range so no contribution to min_dist */
+ if (x <= centerc1) {
+ tdist = (x - maxc1) * C1_SCALE;
+ max_dist += tdist * tdist;
+ } else {
+ tdist = (x - minc1) * C1_SCALE;
+ max_dist += tdist * tdist;
+ }
+ }
+
+ x = cinfo->colormap[2][i];
+ if (x < minc2) {
+ tdist = (x - minc2) * C2_SCALE;
+ min_dist += tdist * tdist;
+ tdist = (x - maxc2) * C2_SCALE;
+ max_dist += tdist * tdist;
+ } else if (x > maxc2) {
+ tdist = (x - maxc2) * C2_SCALE;
+ min_dist += tdist * tdist;
+ tdist = (x - minc2) * C2_SCALE;
+ max_dist += tdist * tdist;
+ } else {
+ /* within cell range so no contribution to min_dist */
+ if (x <= centerc2) {
+ tdist = (x - maxc2) * C2_SCALE;
+ max_dist += tdist * tdist;
+ } else {
+ tdist = (x - minc2) * C2_SCALE;
+ max_dist += tdist * tdist;
+ }
+ }
+
+ mindist[i] = min_dist; /* save away the results */
+ if (max_dist < minmaxdist)
+ minmaxdist = max_dist;
+ }
+
+ /* Now we know that no cell in the update box is more than minmaxdist
+ * away from some colormap entry. Therefore, only colors that are
+ * within minmaxdist of some part of the box need be considered.
+ */
+ ncolors = 0;
+ for (i = 0; i < numcolors; i++) {
+ if (mindist[i] <= minmaxdist)
+ colorlist[ncolors++] = (JSAMPLE)i; /* keep entry i as a candidate */
+ }
+ return ncolors;
+}
+
+
+LOCAL(void)
+find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+/* Find the closest colormap entry for each cell in the update box,
+ * given the list of candidate colors prepared by find_nearby_colors.
+ * Return the indexes of the closest entries in the bestcolor[] array.
+ * This routine uses Thomas' incremental distance calculation method to
+ * find the distance from a colormap entry to successive cells in the box.
+ *
+ * numcolors is the number of entries of colorlist[] that are examined.
+ * bestcolor[] is written with c2 varying fastest, then c1, then c0. On
+ * equal distances the candidate examined first is kept, because the
+ * comparison below is a strict "<".
+ */
+{
+ int ic0, ic1, ic2;
+ int i, icolor;
+ register JLONG *bptr; /* pointer into bestdist[] array */
+ JSAMPLE *cptr; /* pointer into bestcolor[] array */
+ JLONG dist0, dist1; /* initial distance values */
+ register JLONG dist2; /* current distance in inner loop */
+ JLONG xx0, xx1; /* distance increments */
+ register JLONG xx2;
+ JLONG inc0, inc1, inc2; /* initial values for increments */
+ /* This array holds the distance to the nearest-so-far color for each cell */
+ JLONG bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
+
+ /* Initialize best-distance for each cell of the update box */
+ bptr = bestdist;
+ for (i = BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS - 1; i >= 0; i--)
+ *bptr++ = 0x7FFFFFFFL; /* replaced by any smaller distance found below */
+
+ /* For each color selected by find_nearby_colors,
+ * compute its distance to the center of each cell in the box.
+ * If that's less than best-so-far, update best distance and color number.
+ */
+
+ /* Nominal steps between cell centers ("x" in Thomas article) */
+#define STEP_C0 ((1 << C0_SHIFT) * C0_SCALE)
+#define STEP_C1 ((1 << C1_SHIFT) * C1_SCALE)
+#define STEP_C2 ((1 << C2_SHIFT) * C2_SCALE)
+
+ for (i = 0; i < numcolors; i++) {
+ icolor = colorlist[i];
+ /* Compute (square of) distance from minc0/c1/c2 to this color */
+ inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
+ dist0 = inc0 * inc0;
+ inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
+ dist0 += inc1 * inc1;
+ inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
+ dist0 += inc2 * inc2;
+ /* Form the initial difference increments */
+ inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
+ inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1;
+ inc2 = inc2 * (2 * STEP_C2) + STEP_C2 * STEP_C2;
+ /* Now loop over all cells in box, updating distance per Thomas method */
+ bptr = bestdist;
+ cptr = bestcolor;
+ xx0 = inc0;
+ for (ic0 = BOX_C0_ELEMS - 1; ic0 >= 0; ic0--) {
+ dist1 = dist0;
+ xx1 = inc1;
+ for (ic1 = BOX_C1_ELEMS - 1; ic1 >= 0; ic1--) {
+ dist2 = dist1;
+ xx2 = inc2;
+ for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
+ if (dist2 < *bptr) {
+ *bptr = dist2;
+ *cptr = (JSAMPLE)icolor;
+ }
+ dist2 += xx2; /* distance at the next cell along c2 */
+ xx2 += 2 * STEP_C2 * STEP_C2; /* the increment itself grows by a constant */
+ bptr++;
+ cptr++;
+ }
+ dist1 += xx1;
+ xx1 += 2 * STEP_C1 * STEP_C1;
+ }
+ dist0 += xx0;
+ xx0 += 2 * STEP_C0 * STEP_C0;
+ }
+ }
+}
+
+
+LOCAL(void)
+fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
+/* Populate the inverse-colormap cache for the whole update box that holds
+ * histogram cell c0/c1/c2.  The caller only requires that one cell, but
+ * every cell of the box is filled in one go.
+ */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  hist3d histogram = cquantize->histogram;
+  int box0, box1, box2;         /* update box ID along each axis */
+  int org0, org1, org2;         /* center of the box's first cell */
+  int base0, base1, base2;      /* histogram indexes of that first cell */
+  int d0, d1, d2;
+  int ncand;                    /* number of candidate colors */
+  JSAMPLE *nextbest;            /* walks through nearest[] */
+  histptr slot;                 /* walks through the cache */
+  /* Candidate colormap indexes for this box */
+  JSAMPLE candidates[MAXNUMCOLORS];
+  /* Closest colormap index found for each cell of the box */
+  JSAMPLE nearest[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
+
+  /* Which update box does the cell belong to? */
+  box0 = c0 >> BOX_C0_LOG;
+  box1 = c1 >> BOX_C1_LOG;
+  box2 = c2 >> BOX_C2_LOG;
+
+  /* Center of the box's origin-corner cell; this is the lower bound of the
+   * volume that matters.
+   */
+  org0 = (box0 << BOX_C0_SHIFT) + ((1 << C0_SHIFT) >> 1);
+  org1 = (box1 << BOX_C1_SHIFT) + ((1 << C1_SHIFT) >> 1);
+  org2 = (box2 << BOX_C2_SHIFT) + ((1 << C2_SHIFT) >> 1);
+
+  /* First narrow the colormap down to plausible candidates, */
+  ncand = find_nearby_colors(cinfo, org0, org1, org2, candidates);
+
+  /* then pick the nearest candidate for every cell. */
+  find_best_colors(cinfo, org0, org1, org2, ncand, candidates, nearest);
+
+  /* Record each result, offset by one, in the cache */
+  base0 = box0 << BOX_C0_LOG;
+  base1 = box1 << BOX_C1_LOG;
+  base2 = box2 << BOX_C2_LOG;
+  nextbest = nearest;
+  for (d0 = 0; d0 < BOX_C0_ELEMS; d0++) {
+    for (d1 = 0; d1 < BOX_C1_ELEMS; d1++) {
+      slot = &histogram[base0 + d0][base1 + d1][base2];
+      for (d2 = 0; d2 < BOX_C2_ELEMS; d2++)
+        slot[d2] = (histcell)(nextbest[d2] + 1);
+      nextbest += BOX_C2_ELEMS;
+    }
+  }
+}
+
+
+/*
+ * Map some rows of pixels to the output colormapped representation.
+ */
+
+METHODDEF(void)
+pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
+/* Map num_rows rows of three-component pixels to colormap indexes,
+ * without any dithering.
+ */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  hist3d histogram = cquantize->histogram;
+  JDIMENSION width = cinfo->output_width;
+  JDIMENSION x;
+  JSAMPROW src, dst;
+  histptr entry;
+  int r;
+  int h0, h1, h2;               /* cache cell holding the current pixel */
+
+  for (r = 0; r < num_rows; r++) {
+    src = input_buf[r];
+    dst = output_buf[r];
+    for (x = 0; x < width; x++, src += 3) {
+      /* Reduce the pixel to its cache cell */
+      h0 = src[0] >> C0_SHIFT;
+      h1 = src[1] >> C1_SHIFT;
+      h2 = src[2] >> C2_SHIFT;
+      entry = &histogram[h0][h1][h2];
+      /* A zero entry has not been looked up yet; fill the cache first */
+      if (*entry == 0)
+        fill_inverse_cmap(cinfo, h0, h1, h2);
+      /* The cache stores the colormap index plus one */
+      dst[x] = (JSAMPLE)(*entry - 1);
+    }
+  }
+}
+
+
+METHODDEF(void)
+pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
+/* This version performs Floyd-Steinberg dithering.
+ * Error terms carried from row to row live in cquantize->fserrors. Rows
+ * are processed in alternating directions: cquantize->on_odd_row selects
+ * the direction of the current row and is toggled for the next one.
+ */
+{
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+ hist3d histogram = cquantize->histogram;
+ register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
+ LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
+ LOCFSERROR bpreverr0, bpreverr1, bpreverr2; /* error for below/prev col */
+ register FSERRPTR errorptr; /* => fserrors[] at column before current */
+ JSAMPROW inptr; /* => current input pixel */
+ JSAMPROW outptr; /* => current output pixel */
+ histptr cachep;
+ int dir; /* +1 or -1 depending on direction */
+ int dir3; /* 3*dir, for advancing inptr & errorptr */
+ int row;
+ JDIMENSION col;
+ JDIMENSION width = cinfo->output_width;
+ JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *error_limit = cquantize->error_limiter; /* table filled by init_error_limit */
+ JSAMPROW colormap0 = cinfo->colormap[0];
+ JSAMPROW colormap1 = cinfo->colormap[1];
+ JSAMPROW colormap2 = cinfo->colormap[2];
+ SHIFT_TEMPS
+
+ for (row = 0; row < num_rows; row++) {
+ inptr = input_buf[row];
+ outptr = output_buf[row];
+ if (cquantize->on_odd_row) {
+ /* work right to left in this row */
+ inptr += (width - 1) * 3; /* so point to rightmost pixel */
+ outptr += width - 1;
+ dir = -1;
+ dir3 = -3;
+ errorptr = cquantize->fserrors + (width + 1) * 3; /* => entry after last column */
+ cquantize->on_odd_row = FALSE; /* flip for next time */
+ } else {
+ /* work left to right in this row */
+ dir = 1;
+ dir3 = 3;
+ errorptr = cquantize->fserrors; /* => entry before first real column */
+ cquantize->on_odd_row = TRUE; /* flip for next time */
+ }
+ /* Preset error values: no error propagated to first pixel from left */
+ cur0 = cur1 = cur2 = 0;
+ /* and no error propagated to row below yet */
+ belowerr0 = belowerr1 = belowerr2 = 0;
+ bpreverr0 = bpreverr1 = bpreverr2 = 0;
+
+ for (col = width; col > 0; col--) {
+ /* curN holds the error propagated from the previous pixel on the
+ * current line. Add the error propagated from the previous line
+ * to form the complete error correction term for this pixel, and
+ * round the error term (which is expressed * 16) to an integer.
+ * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct
+ * for either sign of the error value.
+ * Note: errorptr points to *previous* column's array entry.
+ */
+ cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3 + 0] + 8, 4);
+ cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3 + 1] + 8, 4);
+ cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3 + 2] + 8, 4);
+ /* Limit the error using transfer function set by init_error_limit.
+ * See comments with init_error_limit for rationale.
+ */
+ cur0 = error_limit[cur0];
+ cur1 = error_limit[cur1];
+ cur2 = error_limit[cur2];
+ /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
+ * The maximum error is +- MAXJSAMPLE (or less with error limiting);
+ * this sets the required size of the range_limit array.
+ */
+ cur0 += inptr[0];
+ cur1 += inptr[1];
+ cur2 += inptr[2];
+ cur0 = range_limit[cur0];
+ cur1 = range_limit[cur1];
+ cur2 = range_limit[cur2];
+ /* Index into the cache with adjusted pixel value */
+ cachep =
+ &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
+ /* If we have not seen this color before, find nearest colormap */
+ /* entry and update the cache */
+ if (*cachep == 0)
+ fill_inverse_cmap(cinfo, cur0 >> C0_SHIFT, cur1 >> C1_SHIFT,
+ cur2 >> C2_SHIFT);
+ /* Now emit the colormap index for this cell */
+ {
+ register int pixcode = *cachep - 1; /* cache holds index plus one */
+ *outptr = (JSAMPLE)pixcode;
+ /* Compute representation error for this pixel */
+ cur0 -= colormap0[pixcode];
+ cur1 -= colormap1[pixcode];
+ cur2 -= colormap2[pixcode];
+ }
+ /* Compute error fractions to be propagated to adjacent pixels.
+ * Add these into the running sums, and simultaneously shift the
+ * next-line error sums left by 1 column.
+ */
+ {
+ register LOCFSERROR bnexterr;
+
+ bnexterr = cur0; /* Process component 0 */
+ errorptr[0] = (FSERROR)(bpreverr0 + cur0 * 3);
+ bpreverr0 = belowerr0 + cur0 * 5;
+ belowerr0 = bnexterr;
+ cur0 *= 7;
+ bnexterr = cur1; /* Process component 1 */
+ errorptr[1] = (FSERROR)(bpreverr1 + cur1 * 3);
+ bpreverr1 = belowerr1 + cur1 * 5;
+ belowerr1 = bnexterr;
+ cur1 *= 7;
+ bnexterr = cur2; /* Process component 2 */
+ errorptr[2] = (FSERROR)(bpreverr2 + cur2 * 3);
+ bpreverr2 = belowerr2 + cur2 * 5;
+ belowerr2 = bnexterr;
+ cur2 *= 7;
+ }
+ /* At this point curN contains the 7/16 error value to be propagated
+ * to the next pixel on the current line, and all the errors for the
+ * next line have been shifted over. We are therefore ready to move on.
+ */
+ inptr += dir3; /* Advance pixel pointers to next column */
+ outptr += dir;
+ errorptr += dir3; /* advance errorptr to current column */
+ }
+ /* Post-loop cleanup: we must unload the final error values into the
+ * final fserrors[] entry. Note we need not unload belowerrN because
+ * it is for the dummy column before or after the actual array.
+ */
+ errorptr[0] = (FSERROR)bpreverr0; /* unload prev errs into array */
+ errorptr[1] = (FSERROR)bpreverr1;
+ errorptr[2] = (FSERROR)bpreverr2;
+ }
+}
+
+
+/*
+ * Initialize the error-limiting transfer function (lookup table).
+ * The raw F-S error computation can potentially compute error values of up to
+ * +- MAXJSAMPLE. But we want the maximum correction applied to a pixel to be
+ * much less, otherwise obviously wrong pixels will be created. (Typical
+ * effects include weird fringes at color-area boundaries, isolated bright
+ * pixels in a dark area, etc.) The standard advice for avoiding this problem
+ * is to ensure that the "corners" of the color cube are allocated as output
+ * colors; then repeated errors in the same direction cannot cause cascading
+ * error buildup. However, that only prevents the error from getting
+ * completely out of hand; Aaron Giles reports that error limiting improves
+ * the results even with corner colors allocated.
+ * A simple clamping of the error values to about +- MAXJSAMPLE/8 works pretty
+ * well, but the smoother transfer function used below is even better. Thanks
+ * to Aaron Giles for this idea.
+ */
+
+LOCAL(void)
+init_error_limit(j_decompress_ptr cinfo)
+/* Allocate the error_limiter lookup table, fill it with the three-segment
+ * transfer function, and attach it to the quantizer object.
+ */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int *limiter;
+  int err, lim;
+
+  limiter = (int *)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
+  /* Point at the middle so indexes -MAXJSAMPLE .. +MAXJSAMPLE are valid */
+  limiter += MAXJSAMPLE;
+  cquantize->error_limiter = limiter;
+
+#define STEPSIZE  ((MAXJSAMPLE + 1) / 16)
+  err = 0;
+  lim = 0;
+  /* First segment: output follows input one for one */
+  while (err < STEPSIZE) {
+    limiter[err] = lim;
+    limiter[-err] = -lim;
+    err++;
+    lim++;
+  }
+  /* Second segment: output rises by one for every two input steps */
+  while (err < STEPSIZE * 3) {
+    limiter[err] = lim;
+    limiter[-err] = -lim;
+    err++;
+    if ((err & 1) == 0)
+      lim++;
+  }
+  /* Last segment: output stays at the value reached above */
+  while (err <= MAXJSAMPLE) {
+    limiter[err] = lim;
+    limiter[-err] = -lim;
+    err++;
+  }
+#undef STEPSIZE
+}
+
+
+/*
+ * Finish up at the end of each pass.
+ */
+
+METHODDEF(void)
+finish_pass1(j_decompress_ptr cinfo)
+/* End of the first pass: install the saved colormap storage, run color
+ * selection, and flag the histogram for clearing.
+ */
+{
+  my_cquantize_ptr quant = (my_cquantize_ptr)cinfo->cquantize;
+
+  /* select_colors writes its result into cinfo->colormap */
+  cinfo->colormap = quant->sv_colormap;
+  select_colors(cinfo, quant->desired);
+
+  /* The next start_pass must zero the color index table */
+  quant->needs_zeroed = TRUE;
+}
+
+
+METHODDEF(void)
+finish_pass2(j_decompress_ptr cinfo)
+{
+ /* no work; start_pass_2_quant installs this as pub.finish_pass when
+ * is_pre_scan is false
+ */
+}
+
+
+/*
+ * Initialize for each processing pass.
+ */
+
+METHODDEF(void)
+start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
+/* Per-pass setup.  Installs the method pointers for either the prescan
+ * pass (is_pre_scan true) or the mapping pass, prepares the dithering
+ * workspace when needed, and clears the histogram if it is flagged.
+ */
+{
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
+  int ncolors;
+  int plane;
+  size_t fsbytes;
+
+  /* Only "no dither" and F-S are implemented; any other request gets F-S. */
+  if (cinfo->dither_mode != JDITHER_NONE)
+    cinfo->dither_mode = JDITHER_FS;
+
+  if (is_pre_scan) {
+    cquantize->pub.color_quantize = prescan_quantize;
+    cquantize->pub.finish_pass = finish_pass1;
+    /* The prescan pass always begins with a cleared histogram */
+    cquantize->needs_zeroed = TRUE;
+  } else {
+    cquantize->pub.color_quantize =
+      (cinfo->dither_mode == JDITHER_FS) ? pass2_fs_dither : pass2_no_dither;
+    cquantize->pub.finish_pass = finish_pass2;
+
+    /* Reject an unusable colormap size */
+    ncolors = cinfo->actual_number_of_colors;
+    if (ncolors < 1)
+      ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, 1);
+    if (ncolors > MAXNUMCOLORS)
+      ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
+
+    if (cinfo->dither_mode == JDITHER_FS) {
+      fsbytes = (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR)));
+      /* Create the F-S error array on first use ... */
+      if (cquantize->fserrors == NULL)
+        cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, fsbytes);
+      /* ... and start every pass with no propagated error */
+      jzero_far((void *)cquantize->fserrors, fsbytes);
+      /* Likewise the error-limit table is built only once */
+      if (cquantize->error_limiter == NULL)
+        init_error_limit(cinfo);
+      cquantize->on_odd_row = FALSE;
+    }
+  }
+
+  /* Clear the histogram / inverse color map when it has been flagged */
+  if (cquantize->needs_zeroed) {
+    for (plane = 0; plane < HIST_C0_ELEMS; plane++)
+      jzero_far((void *)cquantize->histogram[plane],
+                HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
+    cquantize->needs_zeroed = FALSE;
+  }
+}
+
+
+/*
+ * Switch to a new external colormap between output passes.
+ */
+
+METHODDEF(void)
+new_color_map_2_quant(j_decompress_ptr cinfo)
+/* Flag the inverse color map so the next start_pass clears it. */
+{
+  ((my_cquantize_ptr)cinfo->cquantize)->needs_zeroed = TRUE;
+}
+
+
+/*
+ * Module initialization routine for 2-pass color quantization.
+ */
+
+GLOBAL(void)
+jinit_2pass_quantizer(j_decompress_ptr cinfo)
+/* Create the two-pass quantizer object, hook it into cinfo, and allocate
+ * its histogram plus whatever optional workspace the current settings need.
+ */
+{
+  my_cquantize_ptr quant;
+  int plane;
+  int ncolors;
+
+  quant = (my_cquantize_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_cquantizer));
+  cinfo->cquantize = (struct jpeg_color_quantizer *)quant;
+  quant->pub.start_pass = start_pass_2_quant;
+  quant->pub.new_color_map = new_color_map_2_quant;
+  /* NULL marks the optional arrays as not yet allocated */
+  quant->fserrors = NULL;
+  quant->error_limiter = NULL;
+
+  /* This module only handles three output components */
+  if (cinfo->out_color_components != 3)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
+  /* Histogram / inverse colormap: a table of row pointers, then one
+   * separately allocated plane per c0 value
+   */
+  quant->histogram = (hist3d)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
+  for (plane = 0; plane < HIST_C0_ELEMS; plane++)
+    quant->histogram[plane] = (hist2d)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
+  quant->needs_zeroed = TRUE;   /* contents are uninitialized so far */
+
+  /* Storage for the finished colormap is reserved now, when it will be
+   * needed, because it may affect the memory manager's space calculations.
+   */
+  if (!cinfo->enable_2pass_quant) {
+    quant->sv_colormap = NULL;
+  } else {
+    ncolors = cinfo->desired_number_of_colors;
+    /* Lower limit is somewhat arbitrary as long as it is positive */
+    if (ncolors < 8)
+      ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, 8);
+    /* Upper limit keeps colormap indexes representable as JSAMPLEs */
+    if (ncolors > MAXNUMCOLORS)
+      ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
+    quant->sv_colormap = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)ncolors, (JDIMENSION)3);
+    quant->desired = ncolors;
+  }
+
+  /* Only "no dither" and F-S are implemented; any other request gets F-S. */
+  if (cinfo->dither_mode != JDITHER_NONE)
+    cinfo->dither_mode = JDITHER_FS;
+
+  /* The F-S workspace is not used before pass 2, but allocating it here
+   * lets the memory manager account for it.  A later change of
+   * dither_mode is still handled, though max_memory_to_use is then not
+   * guaranteed to be honored.
+   */
+  if (cinfo->dither_mode == JDITHER_FS) {
+    quant->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
+    /* Build the error-limiting table at the same time */
+    init_error_limit(cinfo);
+  }
+}
+
+#endif /* QUANT_2PASS_SUPPORTED */
diff --git a/media/libjpeg/jsimd.h b/media/libjpeg/jsimd.h
new file mode 100644
index 0000000000..74d480aa2c
--- /dev/null
+++ b/media/libjpeg/jsimd.h
@@ -0,0 +1,123 @@
+/*
+ * jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, 2014, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+#include "jchuff.h" /* Declarations shared with jcphuff.c */
+
+EXTERN(int) jsimd_can_rgb_ycc(void); /* the five queries here return 0 in the jsimd_none.c stubs */
+EXTERN(int) jsimd_can_rgb_gray(void);
+EXTERN(int) jsimd_can_ycc_rgb(void);
+EXTERN(int) jsimd_can_ycc_rgb565(void);
+EXTERN(int) jsimd_c_can_null_convert(void);
+
+EXTERN(void) jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows); /* takes a compress object */
+EXTERN(void) jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows); /* takes a compress object */
+EXTERN(void) jsimd_ycc_rgb_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows); /* takes a decompress object */
+EXTERN(void) jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows); /* takes a decompress object */
+EXTERN(void) jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows); /* takes a compress object */
+
+EXTERN(int) jsimd_can_h2v2_downsample(void); /* returns 0 in the jsimd_none.c stub */
+EXTERN(int) jsimd_can_h2v1_downsample(void); /* returns 0 in the jsimd_none.c stub */
+
+EXTERN(void) jsimd_h2v2_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_smooth_downsample(void); /* returns 0 in the jsimd_none.c stub */
+
+EXTERN(void) jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_upsample(void); /* NOTE(review): presumably a capability query like the ones above - confirm */
+EXTERN(int) jsimd_can_h2v1_upsample(void);
+EXTERN(int) jsimd_can_int_upsample(void);
+
+EXTERN(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr); /* output is passed as a pointer to a JSAMPARRAY */
+EXTERN(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
+
+EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
+
+EXTERN(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf); /* unlike the upsamplers above, output is a plain JSAMPARRAY */
+EXTERN(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(int) jsimd_can_huff_encode_one_block(void); /* NOTE(review): presumably a capability query - definition not visible here */
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl); /* c_derived_tbl: presumably declared in jchuff.h, included by this header - confirm */
+
+EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *values, size_t *zerobits); /* returns nothing */
+
+EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *absvalues, size_t *bits); /* unlike the _first_ variant, returns an int */
diff --git a/media/libjpeg/jsimd_none.c b/media/libjpeg/jsimd_none.c
new file mode 100644
index 0000000000..a25db73899
--- /dev/null
+++ b/media/libjpeg/jsimd_none.c
@@ -0,0 +1,431 @@
+/*
+ * jsimd_none.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains stubs for when there is no SIMD support available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include "jdct.h"
+#include "jsimddct.h"
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* intentionally empty; jsimd_can_rgb_ycc() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* intentionally empty; jsimd_can_rgb_gray() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* intentionally empty; jsimd_can_ycc_rgb() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* intentionally empty; jsimd_can_ycc_rgb565() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* intentionally empty; jsimd_c_can_null_convert() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* intentionally empty; jsimd_can_h2v2_downsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* intentionally empty; jsimd_can_h2v2_smooth_downsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* intentionally empty; jsimd_can_h2v1_downsample() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_int_upsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_h2v2_upsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_h2v1_upsample() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_h2v2_fancy_upsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_h2v1_fancy_upsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* intentionally empty; jsimd_can_h1v2_fancy_upsample() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{ /* intentionally empty; jsimd_can_h2v2_merged_upsample() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{ /* intentionally empty; jsimd_can_h2v1_merged_upsample() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{ /* intentionally empty; jsimd_can_convsamp() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{ /* intentionally empty; jsimd_can_convsamp_float() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{ /* intentionally empty; jsimd_can_fdct_islow() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{ /* intentionally empty; jsimd_can_fdct_ifast() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{ /* intentionally empty; jsimd_can_fdct_float() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{ /* intentionally empty; jsimd_can_quantize() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{ /* intentionally empty; jsimd_can_quantize_float() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_2x2() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_4x4() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_6x6() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_12x12() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_islow() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_ifast() in this file returns 0 */
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty; jsimd_can_idct_float() in this file returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{ /* stub; jsimd_can_huff_encode_one_block() in this file returns 0 */
+ return NULL; /* nothing is written to buffer, so no position to hand back */
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{ /* intentionally empty: values and zerobits are left untouched */
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0; /* stub: this file provides no SIMD implementation */
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{ /* stub: absvalues and bits are left untouched */
+ return 0; /* placeholder value only; no work was done */
+}
diff --git a/media/libjpeg/jsimddct.h b/media/libjpeg/jsimddct.h
new file mode 100644
index 0000000000..55ee8cf67f
--- /dev/null
+++ b/media/libjpeg/jsimddct.h
@@ -0,0 +1,70 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+EXTERN(int) jsimd_can_convsamp(void); /* query; the jsimd_none.c stub returns 0 */
+EXTERN(int) jsimd_can_convsamp_float(void);
+
+EXTERN(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace); /* DCTELEM workspace */
+EXTERN(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace); /* FAST_FLOAT workspace */
+
+EXTERN(int) jsimd_can_fdct_islow(void);
+EXTERN(int) jsimd_can_fdct_ifast(void);
+EXTERN(int) jsimd_can_fdct_float(void);
+
+EXTERN(void) jsimd_fdct_islow(DCTELEM *data); /* single pointer: presumably in-place -- TODO confirm */
+EXTERN(void) jsimd_fdct_ifast(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float(FAST_FLOAT *data);
+
+EXTERN(int) jsimd_can_quantize(void);
+EXTERN(int) jsimd_can_quantize_float(void);
+
+EXTERN(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace); /* DCTELEM divisors and workspace */
+EXTERN(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace); /* FAST_FLOAT divisors and workspace */
+
+EXTERN(int) jsimd_can_idct_2x2(void); /* NxN names: presumably scaled-output IDCTs -- TODO confirm */
+EXTERN(int) jsimd_can_idct_4x4(void);
+EXTERN(int) jsimd_can_idct_6x6(void);
+EXTERN(int) jsimd_can_idct_12x12(void);
+
+EXTERN(void) jsimd_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(int) jsimd_can_idct_islow(void); /* query; the jsimd_none.c stub returns 0 */
+EXTERN(int) jsimd_can_idct_ifast(void);
+EXTERN(int) jsimd_can_idct_float(void);
+
+EXTERN(void) jsimd_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col); /* same parameter list as the NxN variants above */
+EXTERN(void) jsimd_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
diff --git a/media/libjpeg/jstdhuff.c b/media/libjpeg/jstdhuff.c
new file mode 100644
index 0000000000..345b513d4d
--- /dev/null
+++ b/media/libjpeg/jstdhuff.c
@@ -0,0 +1,144 @@
+/*
+ * jstdhuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to set the default Huffman tables, if they are
+ * not already set.
+ */
+
+/*
+ * Huffman table setup routines
+ */
+
+LOCAL(void)
+add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
+ const UINT8 *val)
+/* Define a Huffman table, but only if the *htblptr slot is still NULL */
+{
+ int nsymbols, len;
+
+ if (*htblptr == NULL)
+ *htblptr = jpeg_alloc_huff_table(cinfo);
+ else
+ return; /* slot already holds a table: leave it untouched */
+
+ /* Copy the number-of-symbols-of-each-code-length counts */
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+
+ /* Validate the counts. We do this here mainly so we can copy the right
+ * number of symbols from the val[] array, without risking marching off
+ * the end of memory. jchuff.c will do a more thorough test later.
+ */
+ nsymbols = 0;
+ for (len = 1; len <= 16; len++) /* bits[0] is skipped; lengths 1..16 are summed */
+ nsymbols += bits[len];
+ if (nsymbols < 1 || nsymbols > 256) /* an empty table is rejected as well */
+ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+
+ memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+ memset(&((*htblptr)->huffval[nsymbols]), 0, /* zero the unused tail of huffval[] */
+ (256 - nsymbols) * sizeof(UINT8));
+
+ /* Initialize sent_table FALSE so table will be written to JPEG file. */
+ (*htblptr)->sent_table = FALSE;
+}
+
+
+LOCAL(void)
+std_huff_tables(j_common_ptr cinfo)
+/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
+/* IMPORTANT: these are only valid for 8-bit data precision! */
+{
+ JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
+
+ static const UINT8 bits_dc_luminance[17] = { /* 17 entries: index 0 plus lengths 1..16 */
+ /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_luminance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
+
+ static const UINT8 bits_dc_chrominance[17] = {
+ /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_chrominance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
+
+ static const UINT8 bits_ac_luminance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+ };
+ static const UINT8 val_ac_luminance[] = {
+ 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
+
+ static const UINT8 bits_ac_chrominance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+ };
+ static const UINT8 val_ac_chrominance[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
+
+ if (cinfo->is_decompressor) { /* pick the table-pointer arrays of the matching object type */
+ dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
+ ac_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->ac_huff_tbl_ptrs;
+ } else {
+ dc_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->dc_huff_tbl_ptrs;
+ ac_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->ac_huff_tbl_ptrs;
+ }
+
+ add_huff_table(cinfo, &dc_huff_tbl_ptrs[0], bits_dc_luminance, /* slot 0: luminance */
+ val_dc_luminance);
+ add_huff_table(cinfo, &ac_huff_tbl_ptrs[0], bits_ac_luminance,
+ val_ac_luminance);
+ add_huff_table(cinfo, &dc_huff_tbl_ptrs[1], bits_dc_chrominance, /* slot 1: chrominance */
+ val_dc_chrominance);
+ add_huff_table(cinfo, &ac_huff_tbl_ptrs[1], bits_ac_chrominance,
+ val_ac_chrominance);
+}
diff --git a/media/libjpeg/jutils.c b/media/libjpeg/jutils.c
new file mode 100644
index 0000000000..d86271624a
--- /dev/null
+++ b/media/libjpeg/jutils.c
@@ -0,0 +1,133 @@
+/*
+ * jutils.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains tables and miscellaneous utility routines needed
+ * for both compression and decompression.
+ * Note we prefix all global names with "j" to minimize conflicts with
+ * a surrounding application.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/*
+ * jpeg_zigzag_order[i] is the zigzag-order position of the i'th element
+ * of a DCT block read in natural order (left to right, top to bottom).
+ */
+
+#if 0 /* This table is not actually needed in v6a */
+
+const int jpeg_zigzag_order[DCTSIZE2] = { /* compiled out by the #if 0 above */
+ 0, 1, 5, 6, 14, 15, 27, 28,
+ 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43,
+ 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54,
+ 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61,
+ 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+#endif /* 0 */
+
+/*
+ * jpeg_natural_order[i] is the natural-order position of the i'th element
+ * of zigzag order.
+ *
+ * When reading corrupted data, the Huffman decoders could attempt
+ * to reference an entry beyond the end of this array (if the decoded
+ * zero run length reaches past the end of the block). To prevent
+ * wild stores without adding an inner-loop test, we put some extra
+ * "63"s after the real entries. This will cause the extra coefficient
+ * to be stored in location 63 of the block, not somewhere random.
+ * The worst case would be a run-length of 15, which means we need 16
+ * fake entries.
+ */
+
+const int jpeg_natural_order[DCTSIZE2 + 16] = { /* real entries plus 16 padding entries */
+ 0, 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63, /* last real entry */
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+
+/*
+ * Arithmetic utilities
+ */
+
+GLOBAL(long)
+jdiv_round_up(long a, long b)
+/* Compute a/b rounded up to next integer, i.e., ceil(a/b) */
+/* Assumes a >= 0, b > 0 (neither is checked here) */
+{
+ return (a + b - 1L) / b; /* adding b-1 first makes the truncating division round up */
+}
+
+
+GLOBAL(long)
+jround_up(long a, long b)
+/* Compute a rounded up to next multiple of b, i.e., ceil(a/b)*b */
+/* Assumes a >= 0, b > 0 (neither is checked here) */
+{
+ a += b - 1L; /* move past the next multiple unless a is already on one */
+ return a - (a % b); /* drop the remainder to land exactly on that multiple */
+}
+
+
+GLOBAL(void)
+jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row, int num_rows,
+ JDIMENSION num_cols)
+/* Copy some rows of samples from one place to another.
+ * num_rows rows are copied from input_array[source_row++]
+ * to output_array[dest_row++]; these areas may overlap for duplication.
+ * The source and destination arrays must be at least as wide as num_cols.
+ */
+{
+ register JSAMPROW inptr, outptr;
+ register size_t count = (size_t)(num_cols * sizeof(JSAMPLE)); /* bytes per row */
+ register int row;
+
+ input_array += source_row; /* skip to the first row to read */
+ output_array += dest_row; /* skip to the first row to write */
+
+ for (row = num_rows; row > 0; row--) { /* does nothing when num_rows <= 0 */
+ inptr = *input_array++;
+ outptr = *output_array++;
+ memcpy(outptr, inptr, count); /* NOTE(review): memcpy needs the two row buffers themselves not to overlap */
+ }
+}
+
+
+GLOBAL(void)
+jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks)
+/* Copy a row of coefficient blocks from one place to another. */
+{
+ memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF))); /* num_blocks blocks of DCTSIZE2 JCOEFs */
+}
+
+
+GLOBAL(void)
+jzero_far(void *target, size_t bytestozero)
+/* Zero out a chunk of memory. */
+/* This might be sample-array data, block-array data, or alloc_large data. */
+{
+ memset(target, 0, bytestozero); /* plain wrapper: the size is given in bytes */
+}
diff --git a/media/libjpeg/jversion.h b/media/libjpeg/jversion.h
new file mode 100644
index 0000000000..ea6de648d9
--- /dev/null
+++ b/media/libjpeg/jversion.h
@@ -0,0 +1,54 @@
+/*
+ * jversion.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2012-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains software version identification.
+ */
+
+
+#if JPEG_LIB_VERSION >= 80 /* JVERSION string follows the selected JPEG_LIB_VERSION */
+
+#define JVERSION "8d 15-Jan-2012"
+
+#elif JPEG_LIB_VERSION >= 70
+
+#define JVERSION "7 27-Jun-2009"
+
+#else /* JPEG_LIB_VERSION < 70 */
+
+#define JVERSION "6b 27-Mar-1998"
+
+#endif /* JPEG_LIB_VERSION */
+
+/*
+ * NOTE: It is our convention to place the authors in the following order:
+ * - libjpeg-turbo authors (2009-) in descending order of the date of their
+ * most recent contribution to the project, then in ascending order of the
+ * date of their first contribution to the project, then in alphabetical
+ * order
+ * - Upstream authors in descending order of the date of the first inclusion of
+ * their code
+ */
+
+#define JCOPYRIGHT \
+ "Copyright (C) 2009-2023 D. R. Commander\n" \
+ "Copyright (C) 2015, 2020 Google, Inc.\n" \
+ "Copyright (C) 2019-2020 Arm Limited\n" \
+ "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
+ "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+ "Copyright (C) 2015 Intel Corporation\n" \
+ "Copyright (C) 2013-2014 Linaro Limited\n" \
+ "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+ "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
+ "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+ "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+ "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+
+#define JCOPYRIGHT_SHORT \
+ "Copyright (C) 1991-2023 The libjpeg-turbo Project and many others"
diff --git a/media/libjpeg/moz.build b/media/libjpeg/moz.build
new file mode 100644
index 0000000000..9053f56765
--- /dev/null
+++ b/media/libjpeg/moz.build
@@ -0,0 +1,323 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files("**"): # "**" pattern: presumably every file under this directory -- TODO confirm
+ BUG_COMPONENT = ("Core", "Graphics: ImageLib")
+
+EXPORTS += [ # headers listed for export from this directory
+ 'jconfig.h',
+ 'jerror.h',
+ 'jinclude.h',
+ 'jmorecfg.h',
+ 'jpegint.h',
+ 'jpeglib.h',
+]
+
+SOURCES += [ # unconditional core list; the JPEG-writing sources are added separately
+ 'jcomapi.c',
+ 'jdapimin.c',
+ 'jdapistd.c',
+ 'jdatadst.c',
+ 'jdatasrc.c',
+ 'jdcoefct.c',
+ 'jdcolor.c',
+ 'jddctmgr.c',
+ 'jdhuff.c',
+ 'jdicc.c',
+ 'jdinput.c',
+ 'jdmainct.c',
+ 'jdmarker.c',
+ 'jdmaster.c',
+ 'jdmerge.c',
+ 'jdphuff.c',
+ 'jdpostct.c',
+ 'jdsample.c',
+ 'jdtrans.c',
+ 'jerror.c',
+ 'jfdctflt.c',
+ 'jfdctfst.c',
+ 'jfdctint.c',
+ 'jidctflt.c',
+ 'jidctfst.c',
+ 'jidctint.c',
+ 'jidctred.c',
+ 'jmemmgr.c',
+ 'jmemnobs.c',
+ 'jquant1.c',
+ 'jquant2.c',
+ 'jutils.c',
+]
+
+# These files enable support for writing JPEGs
+SOURCES += [ # JPEG-writing sources (jc* files), also added unconditionally
+ 'jcapimin.c',
+ 'jcapistd.c',
+ 'jccoefct.c',
+ 'jccolor.c',
+ 'jcdctmgr.c',
+ 'jchuff.c',
+ 'jcicc.c',
+ 'jcinit.c',
+ 'jcmainct.c',
+ 'jcmarker.c',
+ 'jcmaster.c',
+ 'jcparam.c',
+ 'jcphuff.c',
+ 'jcprepct.c',
+ 'jcsample.c',
+ 'jctrans.c',
+]
+
+if CONFIG['LIBJPEG_TURBO_HAVE_VLD1_S16_X3']: # each CONFIG flag is mirrored as a define without the LIBJPEG_TURBO_ prefix
+ DEFINES['HAVE_VLD1_S16_X3'] = True
+
+if CONFIG['LIBJPEG_TURBO_HAVE_VLD1_U16_X2']:
+ DEFINES['HAVE_VLD1_U16_X2'] = True
+
+if CONFIG['LIBJPEG_TURBO_HAVE_VLD1Q_U8_X4']:
+ DEFINES['HAVE_VLD1Q_U8_X4'] = True
+
+if CONFIG['LIBJPEG_TURBO_NEON_INTRINSICS']:
+ DEFINES['NEON_INTRINSICS'] = True
+
+if CONFIG['LIBJPEG_TURBO_USE_NASM']:
+ USE_NASM = True # a build variable, not a DEFINES entry like the ones above
+
+if CONFIG['LIBJPEG_TURBO_SIMD_FLAGS']: # SIMD flags configured: choose sources by CPU_ARCH
+ if CONFIG['CPU_ARCH'] == 'arm':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch32',
+ ]
+ SOURCES += [
+ 'simd/arm/aarch32/jsimd.c', # not in simd_sources, so it gets no extra SIMD flags
+ ]
+
+ simd_sources = [ # files that receive LIBJPEG_TURBO_SIMD_FLAGS in the loop below
+ 'simd/arm/aarch32/jchuff-neon.c',
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdcolor-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jfdctint-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
+ ]
+ SOURCES += simd_sources
+
+ if CONFIG['LIBJPEG_TURBO_NEON_INTRINSICS']:
+ simd_sources_intrinsics = [
+ 'simd/arm/jccolor-neon.c',
+ 'simd/arm/jidctfst-neon.c',
+ 'simd/arm/jidctint-neon.c',
+ ]
+ SOURCES += simd_sources_intrinsics
+ simd_sources += simd_sources_intrinsics # so the flags loop covers these too
+ else: # without Neon intrinsics the .S assembly file is built instead
+ SOURCES += [
+ 'simd/arm/aarch32/jsimd_neon.S',
+ ]
+
+ for srcfile in simd_sources: # per-file compile flags
+ SOURCES[srcfile].flags += CONFIG['LIBJPEG_TURBO_SIMD_FLAGS']
+ elif CONFIG['CPU_ARCH'] == 'aarch64':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch64',
+ ]
+ SOURCES += [
+ 'simd/arm/aarch64/jsimd.c', # not in simd_sources, so it gets no extra SIMD flags
+ ]
+
+ simd_sources = [ # files that receive LIBJPEG_TURBO_SIMD_FLAGS in the loop below
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jidctfst-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
+ ]
+ SOURCES += simd_sources
+
+ if CONFIG['LIBJPEG_TURBO_NEON_INTRINSICS']:
+ simd_sources_intrinsics = [ # this list differs from the 32-bit arm branch
+ 'simd/arm/aarch64/jchuff-neon.c',
+ 'simd/arm/jccolor-neon.c',
+ 'simd/arm/jdcolor-neon.c',
+ 'simd/arm/jfdctint-neon.c',
+ 'simd/arm/jidctint-neon.c',
+ ]
+ SOURCES += simd_sources_intrinsics
+ simd_sources += simd_sources_intrinsics # so the flags loop covers these too
+ else: # without Neon intrinsics the .S assembly file is built instead
+ SOURCES += [
+ 'simd/arm/aarch64/jsimd_neon.S',
+ ]
+
+ for srcfile in simd_sources: # per-file compile flags
+ SOURCES[srcfile].flags += CONFIG['LIBJPEG_TURBO_SIMD_FLAGS']
+ elif CONFIG['CPU_ARCH'] == 'mips32':
+ SOURCES += [
+ 'simd/mips/jsimd.c',
+ 'simd/mips/jsimd_dspr2.S',
+ ]
+ if CONFIG['CC_TYPE'] == 'clang': # clang only: extra flag for the .S file
+ SOURCES['simd/mips/jsimd_dspr2.S'].flags += [
+ '-fno-integrated-as', # presumably routes the file to an external assembler -- TODO confirm
+ ]
+ elif CONFIG['CPU_ARCH'] == 'mips64':
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/mips64']
+ simd_sources = [ # files that receive LIBJPEG_TURBO_SIMD_FLAGS in the loop below
+ 'simd/mips64/jccolor-mmi.c',
+ 'simd/mips64/jcgray-mmi.c',
+ 'simd/mips64/jcsample-mmi.c',
+ 'simd/mips64/jdcolor-mmi.c',
+ 'simd/mips64/jdmerge-mmi.c',
+ 'simd/mips64/jdsample-mmi.c',
+ 'simd/mips64/jfdctfst-mmi.c',
+ 'simd/mips64/jfdctint-mmi.c',
+ 'simd/mips64/jidctfst-mmi.c',
+ 'simd/mips64/jidctint-mmi.c',
+ 'simd/mips64/jquanti-mmi.c',
+ ]
+ SOURCES += simd_sources
+ SOURCES += [
+ 'simd/mips64/jsimd.c' # added separately, so it gets no extra SIMD flags
+ ]
+ for srcfile in simd_sources: # per-file compile flags
+ SOURCES[srcfile].flags += CONFIG['LIBJPEG_TURBO_SIMD_FLAGS']
+ elif CONFIG['CPU_ARCH'] == 'x86_64': # .asm files plus jsimd.c; no per-file flags loop in this branch
+ SOURCES += [
+ 'simd/x86_64/jccolor-avx2.asm',
+ 'simd/x86_64/jccolor-sse2.asm',
+ 'simd/x86_64/jcgray-avx2.asm',
+ 'simd/x86_64/jcgray-sse2.asm',
+ 'simd/x86_64/jchuff-sse2.asm',
+ 'simd/x86_64/jcphuff-sse2.asm',
+ 'simd/x86_64/jcsample-avx2.asm',
+ 'simd/x86_64/jcsample-sse2.asm',
+ 'simd/x86_64/jdcolor-avx2.asm',
+ 'simd/x86_64/jdcolor-sse2.asm',
+ 'simd/x86_64/jdmerge-avx2.asm',
+ 'simd/x86_64/jdmerge-sse2.asm',
+ 'simd/x86_64/jdsample-avx2.asm',
+ 'simd/x86_64/jdsample-sse2.asm',
+ 'simd/x86_64/jfdctflt-sse.asm',
+ 'simd/x86_64/jfdctfst-sse2.asm',
+ 'simd/x86_64/jfdctint-avx2.asm',
+ 'simd/x86_64/jfdctint-sse2.asm',
+ 'simd/x86_64/jidctflt-sse2.asm',
+ 'simd/x86_64/jidctfst-sse2.asm',
+ 'simd/x86_64/jidctint-avx2.asm',
+ 'simd/x86_64/jidctint-sse2.asm',
+ 'simd/x86_64/jidctred-sse2.asm',
+ 'simd/x86_64/jquantf-sse2.asm',
+ 'simd/x86_64/jquanti-avx2.asm',
+ 'simd/x86_64/jquanti-sse2.asm',
+ 'simd/x86_64/jsimd.c', # the only C file in this list
+ 'simd/x86_64/jsimdcpu.asm',
+ ]
+ elif CONFIG['CPU_ARCH'] == 'x86': # 32-bit list also carries -mmx and -3dn files, unlike x86_64
+ SOURCES += [
+ 'simd/i386/jccolor-avx2.asm',
+ 'simd/i386/jccolor-mmx.asm',
+ 'simd/i386/jccolor-sse2.asm',
+ 'simd/i386/jcgray-avx2.asm',
+ 'simd/i386/jcgray-mmx.asm',
+ 'simd/i386/jcgray-sse2.asm',
+ 'simd/i386/jchuff-sse2.asm',
+ 'simd/i386/jcphuff-sse2.asm',
+ 'simd/i386/jcsample-avx2.asm',
+ 'simd/i386/jcsample-mmx.asm',
+ 'simd/i386/jcsample-sse2.asm',
+ 'simd/i386/jdcolor-avx2.asm',
+ 'simd/i386/jdcolor-mmx.asm',
+ 'simd/i386/jdcolor-sse2.asm',
+ 'simd/i386/jdmerge-avx2.asm',
+ 'simd/i386/jdmerge-mmx.asm',
+ 'simd/i386/jdmerge-sse2.asm',
+ 'simd/i386/jdsample-avx2.asm',
+ 'simd/i386/jdsample-mmx.asm',
+ 'simd/i386/jdsample-sse2.asm',
+ 'simd/i386/jfdctflt-3dn.asm',
+ 'simd/i386/jfdctflt-sse.asm',
+ 'simd/i386/jfdctfst-mmx.asm',
+ 'simd/i386/jfdctfst-sse2.asm',
+ 'simd/i386/jfdctint-avx2.asm',
+ 'simd/i386/jfdctint-mmx.asm',
+ 'simd/i386/jfdctint-sse2.asm',
+ 'simd/i386/jidctflt-3dn.asm',
+ 'simd/i386/jidctflt-sse.asm',
+ 'simd/i386/jidctflt-sse2.asm',
+ 'simd/i386/jidctfst-mmx.asm',
+ 'simd/i386/jidctfst-sse2.asm',
+ 'simd/i386/jidctint-avx2.asm',
+ 'simd/i386/jidctint-mmx.asm',
+ 'simd/i386/jidctint-sse2.asm',
+ 'simd/i386/jidctred-mmx.asm',
+ 'simd/i386/jidctred-sse2.asm',
+ 'simd/i386/jquant-3dn.asm',
+ 'simd/i386/jquant-mmx.asm',
+ 'simd/i386/jquant-sse.asm',
+ 'simd/i386/jquantf-sse2.asm',
+ 'simd/i386/jquanti-avx2.asm',
+ 'simd/i386/jquanti-sse2.asm',
+ 'simd/i386/jsimd.c', # the only C file in this list
+ 'simd/i386/jsimdcpu.asm',
+ ]
+elif CONFIG['CPU_ARCH'].startswith('ppc'): # prefix match on CPU_ARCH, not an exact comparison
+ # PowerPC has no assembly files, but still needs its own headers.
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/powerpc']
+
+ # For libjpeg's internal runtime detection to work, jsimd.c must NOT
+ # be compiled with -maltivec (otherwise it gets statically set),
+ # but everything else should be. If -maltivec was already
+ # specified in .mozconfig, though, then this won't harm anything.
+ ppc_vmx_sources = [
+ 'simd/powerpc/jccolor-altivec.c',
+ 'simd/powerpc/jcgray-altivec.c',
+ 'simd/powerpc/jcsample-altivec.c',
+ 'simd/powerpc/jdcolor-altivec.c',
+ 'simd/powerpc/jdmerge-altivec.c',
+ 'simd/powerpc/jdsample-altivec.c',
+ 'simd/powerpc/jfdctfst-altivec.c',
+ 'simd/powerpc/jfdctint-altivec.c',
+ 'simd/powerpc/jidctfst-altivec.c',
+ 'simd/powerpc/jidctint-altivec.c',
+ 'simd/powerpc/jquanti-altivec.c',
+ ]
+ SOURCES += ppc_vmx_sources
+ SOURCES += [
+ 'simd/powerpc/jsimd.c', # kept out of ppc_vmx_sources so it gets no PPC_VMX_FLAGS
+ ]
+ for srcfile in ppc_vmx_sources: # this branch uses PPC_VMX_FLAGS, not LIBJPEG_TURBO_SIMD_FLAGS
+ SOURCES[srcfile].flags += CONFIG['PPC_VMX_FLAGS']
+else: # No SIMD support?
+ SOURCES += [
+ 'jsimd_none.c', # stub jsimd_* functions: queries return 0, workers are empty
+ ]
+
+ASFLAGS += CONFIG['LIBJPEG_TURBO_SIMD_FLAGS'] # the same flags are also handed to the assembler
+
+# Make sure the x86 & x86-64 ASM files can see the necessary includes.
+if CONFIG['CPU_ARCH'] == 'x86':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR] # shared nasm include directory
+ ASFLAGS += ['-I%s/media/libjpeg/simd/i386/' % TOPSRCDIR] # arch-specific include directory
+if CONFIG['CPU_ARCH'] == 'x86_64':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR] # shared nasm include directory
+ ASFLAGS += ['-I%s/media/libjpeg/simd/x86_64/' % TOPSRCDIR] # arch-specific include directory
+
+# We allow warnings for third-party code that can be updated from upstream.
+AllowCompilerWarnings() # see the comment above: warnings are tolerated for this vendored code
+
+FINAL_LIBRARY = 'gkmedias' # presumably the library these objects are linked into -- TODO confirm
+
diff --git a/media/libjpeg/mozilla.diff b/media/libjpeg/mozilla.diff
new file mode 100644
index 0000000000..bc1bcb3066
--- /dev/null
+++ b/media/libjpeg/mozilla.diff
@@ -0,0 +1,59 @@
+diff --git jmorecfg.h jmorecfg.h
+--- jmorecfg.h
++++ jmorecfg.h
+@@ -13,8 +13,9 @@
+ * JPEG software for special applications or support machine-dependent
+ * optimizations. Most users will not need to touch this file.
+ */
+
++#include <stdint.h>
+
+ /*
+ * Maximum number of components (color channels) allowed in JPEG image.
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+@@ -95,23 +96,17 @@ typedef unsigned char JOCTET;
+ */
+
+ /* UINT8 must hold at least the values 0..255. */
+
+-typedef unsigned char UINT8;
++typedef uint8_t UINT8;
+
+ /* UINT16 must hold at least the values 0..65535. */
+
+-#ifdef HAVE_UNSIGNED_SHORT
+-typedef unsigned short UINT16;
+-#else /* not HAVE_UNSIGNED_SHORT */
+-typedef unsigned int UINT16;
+-#endif /* HAVE_UNSIGNED_SHORT */
++typedef uint16_t UINT16;
+
+ /* INT16 must hold at least the values -32768..32767. */
+
+-#ifndef XMD_H /* X11/xmd.h correctly defines INT16 */
+-typedef short INT16;
+-#endif
++typedef int16_t INT16;
+
+ /* INT32 must hold at least signed 32-bit values.
+ *
+ * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were
+@@ -136,17 +131,9 @@ typedef short INT16;
+ * for internal use, which ensures that internal behavior will always be the
+ * same regardless of any external headers that may be included.
+ */
+
+-#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
+-#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
+-#ifndef _BASETSD_H /* MinGW is slightly different */
+-#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
+-typedef long INT32;
+-#endif
+-#endif
+-#endif
+-#endif
++typedef int32_t INT32;
+
+ /* Datatype used for image dimensions. The JPEG standard only supports
+ * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
+ * "unsigned int" is sufficient on all machines. However, if you need to
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+/* Convert num_rows rows of packed RGB(X/A) pixels to separate Y, Cb, and Cr
+ * planes.
+ *
+ * image_width - number of pixels in each row
+ * input_buf - array of input row pointers; one row is consumed per iteration
+ * output_buf - output planes; [0] receives Y, [1] receives Cb, [2] receives Cr
+ * output_row - index of the first output row to write in each plane
+ * num_rows - number of rows to convert
+ *
+ * Pixels are processed eight at a time. The stores always write eight
+ * samples per plane, even when fewer than eight columns remain.
+ */
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+ /* Cb/Cr accumulator seed: 128 in 16.16 fixed point plus 32767 (just under
+ * one half), so the truncating right shift used below rounds the result.
+ */
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 8) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+ /* De-interleave eight pixels into one 8-lane vector per channel, then
+ * widen the R, G, and B channels to 16 bits.
+ */
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+/* Huffman-encode one block of 64 DCT coefficients.
+ *
+ * state - cast to working_state *; cur.put_buffer and cur.free_bits are
+ * read on entry and written back before returning
+ * buffer - output byte pointer; returned to the caller
+ * NOTE(review): presumably advanced by the PUT_CODE/PUT_BITS
+ * macros, which are defined outside this function - confirm
+ * block - the 64 coefficients, read at indices 0..63
+ * last_dc_val - subtracted from block[0] to form the DC difference
+ * dctbl, actbl - tables whose ehufco[]/ehufsi[] entries supply the code and
+ * size for the DC and AC symbols respectively
+ *
+ * The function first builds, in zig-zag order, the bit count (block_nbits)
+ * and the value to emit (block_diff) for every coefficient, plus a 64-bit
+ * bitmap of nonzero coefficients, and then walks the bitmap to emit codes.
+ */
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint8_t block_nbits[DCTSIZE2];
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load first four rows of coefficients from DCT block in zig-zag order. */
+
+ /* Compute DC coefficient difference value. (F.1.1.5.1) */
+ int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+ row0 = vld1q_lane_s16(block + 1, row0, 1);
+ row0 = vld1q_lane_s16(block + 8, row0, 2);
+ row0 = vld1q_lane_s16(block + 16, row0, 3);
+ row0 = vld1q_lane_s16(block + 9, row0, 4);
+ row0 = vld1q_lane_s16(block + 2, row0, 5);
+ row0 = vld1q_lane_s16(block + 3, row0, 6);
+ row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+ int16x8_t row1 = vld1q_dup_s16(block + 17);
+ row1 = vld1q_lane_s16(block + 24, row1, 1);
+ row1 = vld1q_lane_s16(block + 32, row1, 2);
+ row1 = vld1q_lane_s16(block + 25, row1, 3);
+ row1 = vld1q_lane_s16(block + 18, row1, 4);
+ row1 = vld1q_lane_s16(block + 11, row1, 5);
+ row1 = vld1q_lane_s16(block + 4, row1, 6);
+ row1 = vld1q_lane_s16(block + 5, row1, 7);
+
+ int16x8_t row2 = vld1q_dup_s16(block + 12);
+ row2 = vld1q_lane_s16(block + 19, row2, 1);
+ row2 = vld1q_lane_s16(block + 26, row2, 2);
+ row2 = vld1q_lane_s16(block + 33, row2, 3);
+ row2 = vld1q_lane_s16(block + 40, row2, 4);
+ row2 = vld1q_lane_s16(block + 48, row2, 5);
+ row2 = vld1q_lane_s16(block + 41, row2, 6);
+ row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+ int16x8_t row3 = vld1q_dup_s16(block + 27);
+ row3 = vld1q_lane_s16(block + 20, row3, 1);
+ row3 = vld1q_lane_s16(block + 13, row3, 2);
+ row3 = vld1q_lane_s16(block + 6, row3, 3);
+ row3 = vld1q_lane_s16(block + 7, row3, 4);
+ row3 = vld1q_lane_s16(block + 14, row3, 5);
+ row3 = vld1q_lane_s16(block + 21, row3, 6);
+ row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+ /* Build a per-coefficient mask: the arithmetic shift by 15 yields all ones
+ * for negative coefficients and zero otherwise; shifting that right by the
+ * leading-zero count leaves only the low nbits bits set.
+ */
+ uint16x8_t row0_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+ vnegq_s16(row0_lz));
+ uint16x8_t row1_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+ vnegq_s16(row1_lz));
+ uint16x8_t row2_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+ vnegq_s16(row2_lz));
+ uint16x8_t row3_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+ vnegq_s16(row3_lz));
+
+ /* XOR with the mask inverts the low nbits bits of the absolute value for
+ * negative coefficients and leaves non-negative coefficients unchanged.
+ */
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+ /* Store diff values for rows 0, 1, 2, and 3. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+ /* Load last four rows of coefficients from DCT block in zig-zag order. */
+ int16x8_t row4 = vld1q_dup_s16(block + 35);
+ row4 = vld1q_lane_s16(block + 42, row4, 1);
+ row4 = vld1q_lane_s16(block + 49, row4, 2);
+ row4 = vld1q_lane_s16(block + 56, row4, 3);
+ row4 = vld1q_lane_s16(block + 57, row4, 4);
+ row4 = vld1q_lane_s16(block + 50, row4, 5);
+ row4 = vld1q_lane_s16(block + 43, row4, 6);
+ row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+ int16x8_t row5 = vld1q_dup_s16(block + 29);
+ row5 = vld1q_lane_s16(block + 22, row5, 1);
+ row5 = vld1q_lane_s16(block + 15, row5, 2);
+ row5 = vld1q_lane_s16(block + 23, row5, 3);
+ row5 = vld1q_lane_s16(block + 30, row5, 4);
+ row5 = vld1q_lane_s16(block + 37, row5, 5);
+ row5 = vld1q_lane_s16(block + 44, row5, 6);
+ row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+ int16x8_t row6 = vld1q_dup_s16(block + 58);
+ row6 = vld1q_lane_s16(block + 59, row6, 1);
+ row6 = vld1q_lane_s16(block + 52, row6, 2);
+ row6 = vld1q_lane_s16(block + 45, row6, 3);
+ row6 = vld1q_lane_s16(block + 38, row6, 4);
+ row6 = vld1q_lane_s16(block + 31, row6, 5);
+ row6 = vld1q_lane_s16(block + 39, row6, 6);
+ row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+ int16x8_t row7 = vld1q_dup_s16(block + 53);
+ row7 = vld1q_lane_s16(block + 60, row7, 1);
+ row7 = vld1q_lane_s16(block + 61, row7, 2);
+ row7 = vld1q_lane_s16(block + 54, row7, 3);
+ row7 = vld1q_lane_s16(block + 47, row7, 4);
+ row7 = vld1q_lane_s16(block + 55, row7, 5);
+ row7 = vld1q_lane_s16(block + 62, row7, 6);
+ row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+ /* Same sign-mask and XOR construction as for rows 0-3. */
+ uint16x8_t row4_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+ vnegq_s16(row4_lz));
+ uint16x8_t row5_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+ vnegq_s16(row5_lz));
+ uint16x8_t row6_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+ vnegq_s16(row6_lz));
+ uint16x8_t row7_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+ vnegq_s16(row7_lz));
+
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+ /* Store diff values for rows 4, 5, 6, and 7. */
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+ uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+ uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+ uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+ uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+ uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+ uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+ uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+ row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+ row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+ row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+ row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+ row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+ row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+ row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+ /* Three rounds of pairwise adds fold the eight per-row byte vectors into a
+ * single 8-byte vector holding one bitmap byte per row.
+ */
+ uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+ /* Move bitmap to 32-bit scalar registers. */
+ uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+ uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ unsigned int nbits = block_nbits[0];
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = block_diff[0];
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* Coefficients 1..32: the leading-zero count of the bitmap is the length of
+ * the zero run that precedes the next nonzero coefficient.
+ */
+ while (bitmap_1_32 != 0) {
+ r = BUILTIN_CLZ(bitmap_1_32);
+ i += r;
+ bitmap_1_32 <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap_1_32 <<= 1;
+ }
+
+ /* Carry the zeros left over at the end of the first half into the run
+ * length for the second half, which starts at coefficient 33.
+ */
+ r = 33 - i;
+ i = 33;
+
+ while (bitmap_33_63 != 0) {
+ unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+ r += leading_zeros;
+ i += leading_zeros;
+ bitmap_33_63 <<= leading_zeros;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ r = 0;
+ i++;
+ bitmap_33_63 <<= 1;
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ /* Write the local bit-buffer state back for the next block. */
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..04d64526fb
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,976 @@
+/*
+ * jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/*
+ * Return 1 if the line in 'buffer' starts with "Features" and lists 'feature'
+ * as a whitespace-delimited word after that prefix; return 0 otherwise.
+ * An empty 'feature' never matches.
+ */
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *list, *match, *end;
+  size_t feature_len = strlen(feature);
+
+  if (feature_len == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  /* isspace() is undefined for negative values, so go through unsigned char */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+
+  /* Remember where the list starts.  The "preceded by whitespace" test must
+   * be made against this fixed position rather than the moving search
+   * cursor; otherwise a rejected occurrence such as "xneon" is accepted once
+   * the cursor has been advanced up to it.
+   */
+  list = buffer;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((match = strstr(buffer, feature)) != NULL) {
+    end = match + feature_len;
+    if ((match == list || isspace((unsigned char)match[-1])) &&
+        (*end == 0 || isspace((unsigned char)*end)))
+      return 1;
+    /* Not a whole word: resume just past the start of this occurrence */
+    buffer = match + 1;
+  }
+  return 0;
+}
+
+/*
+ * Scan /proc/cpuinfo line by line using a line buffer of 'bufsize' bytes and
+ * set JSIMD_NEON in simd_support if a "neon" feature is listed.
+ *
+ * Returns 0 if the buffer could not be allocated or turned out to be too
+ * small to hold a complete line, and 1 otherwise (including the case where
+ * the file cannot be opened).
+ */
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  FILE *cpuinfo;
+  int complete = 1;
+  char *line = (char *)malloc(bufsize);
+
+  simd_support = 0;
+
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo != NULL) {
+    while (fgets(line, bufsize, cpuinfo) != NULL) {
+      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+        /* The line was truncated, so the buffer is too small. */
+        complete = 0;
+        break;
+      }
+      if (check_feature(line, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(cpuinfo);
+  }
+
+  free(line);
+  return complete;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+/*
+ * One-time probe: does nothing unless simd_support still holds its ~0
+ * sentinel.  Fills in simd_support (and possibly clears simd_huffman) from
+ * the build flags, /proc/cpuinfo, and the JSIMD_* environment variables.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  /* First guess for the line buffer size; doubled on each failed attempt */
+  int bufsize = 1024;
+#endif
+
+  /* Already initialized? */
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* Even when Neon is not enabled globally through the -mcpu/-mfpu options
+   * passed to gcc, it can still be detected at run time by parsing
+   * /proc/cpuinfo on Linux/Android.
+   */
+  do {
+    if (parse_proc_cpuinfo(bufsize))
+      break;
+    bufsize *= 2;
+  } while (bufsize <= SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT);
+#endif
+
+#ifndef NO_GETENV
+  /* Environment variables override whatever was detected above. */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && strcmp(env, "1") == 0)
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && strcmp(env, "1") == 0)
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && strcmp(env, "1") == 0)
+    simd_huffman = 0;
+#endif
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples, a 4-byte JDIMENSION, and 3- or 4-byte RGB pixels; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples, a 4-byte JDIMENSION, and 3- or 4-byte RGB pixels; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples, a 4-byte JDIMENSION, and 3- or 4-byte RGB pixels; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Pick the Neon conversion routine that matches cinfo->in_color_space and
+ * call it with cinfo->image_width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  /* The extrgb routine serves JCS_EXT_RGB and every colorspace not listed
+   * in the switch below.
+   */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_ycc_convert_neon;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Pick the Neon conversion routine that matches cinfo->in_color_space and
+ * call it with cinfo->image_width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  /* The extrgb routine serves JCS_EXT_RGB and every colorspace not listed
+   * in the switch below.
+   */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_gray_convert_neon;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Pick the Neon conversion routine that matches cinfo->out_color_space and
+ * call it with cinfo->output_width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  /* The extrgb routine serves JCS_EXT_RGB and every colorspace not listed
+   * in the switch below.
+   */
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_extrgb_convert_neon;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/*
+ * Forward to jsimd_ycc_rgb565_convert_neon(), passing cinfo->output_width
+ * followed by the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples, DCTSIZE of 8, and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples, DCTSIZE of 8, and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Forward to jsimd_h2v2_downsample_neon(), passing the image width and
+ * maximum vertical sampling factor from cinfo, the vertical sampling factor
+ * and width in blocks from compptr, and the input/output row arrays.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+/*
+ * Forward to jsimd_h2v1_downsample_neon(), passing the image width and
+ * maximum vertical sampling factor from cinfo, the vertical sampling factor
+ * and width in blocks from compptr, and the input/output row arrays.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Forward to jsimd_h2v2_upsample_neon(), passing the maximum vertical
+ * sampling factor and output width from cinfo plus the input rows and the
+ * output pointer. compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/*
+ * Forward to jsimd_h2v1_upsample_neon(), passing the maximum vertical
+ * sampling factor and output width from cinfo plus the input rows and the
+ * output pointer. compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Forward to jsimd_h2v2_fancy_upsample_neon(), passing the maximum vertical
+ * sampling factor from cinfo, the component's downsampled width, the input
+ * rows, and the output pointer.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/*
+ * Forward to jsimd_h2v1_fancy_upsample_neon(), passing the maximum vertical
+ * sampling factor from cinfo, the component's downsampled width, the input
+ * rows, and the output pointer.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/*
+ * Forward to jsimd_h1v2_fancy_upsample_neon(), passing the maximum vertical
+ * sampling factor from cinfo, the component's downsampled width, the input
+ * rows, and the output pointer.
+ */
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if Neon is flagged in simd_support and the build uses 8-bit
+ * samples and a 4-byte JDIMENSION; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is only optimised for this configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Pick the Neon h2v2 merged-upsample routine that matches
+ * cinfo->out_color_space and call it with cinfo->output_width and the
+ * remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* The extrgb routine serves JCS_EXT_RGB and every colorspace not listed
+   * in the switch below.
+   */
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v2_extrgb_merged_upsample_neon;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = jsimd_h2v2_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    upsample = jsimd_h2v2_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = jsimd_h2v2_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = jsimd_h2v2_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = jsimd_h2v2_extxrgb_merged_upsample_neon;
+    break;
+  default:
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * h2v1 merged upsampling entry point.
+ *
+ * Picks the Neon routine that matches cinfo->out_color_space and calls it
+ * with cinfo->output_width plus the caller's buffers.  The X and A variants
+ * of each 4-component layout share one routine.  JCS_EXT_RGB and every
+ * colorspace without a dedicated case use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Start with the routine shared by JCS_EXT_RGB and the fallback case */
+  void (*upsample_fn) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v1_extrgb_merged_upsample_neon;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample_fn = jsimd_h2v1_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    upsample_fn = jsimd_h2v1_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample_fn = jsimd_h2v1_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample_fn = jsimd_h2v1_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample_fn = jsimd_h2v1_extxrgb_merged_upsample_neon;
+    break;
+  default:
+    /* JCS_EXT_RGB and anything else: keep the extrgb routine */
+    break;
+  }
+
+  upsample_fn(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Tell the caller whether the Neon integer sample conversion may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, samples are 8 bits wide, JDIMENSION is
+ * 4 bytes, DCTELEM is 2 bytes and init_simd() has left the JSIMD_NEON bit
+ * set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Floating-point sample conversion is never offered by this file
+ * (jsimd_convsamp_float() is an empty stub), so this always returns 0.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+/*
+ * Integer sample conversion entry point: passes all three arguments straight
+ * to the Neon routine.
+ */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data,
+                      start_col,
+                      workspace);
+}
+
+/*
+ * Empty stub: does nothing with its arguments.  jsimd_can_convsamp_float()
+ * always returns 0, so callers that check it first should never get here.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  /* intentionally empty */
+}
+
+/*
+ * Tell the caller whether the Neon "islow" forward DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, DCTELEM is 2 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Tell the caller whether the Neon "ifast" forward DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, DCTELEM is 2 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * A floating-point forward DCT is never offered by this file
+ * (jsimd_fdct_float() is an empty stub), so this always returns 0.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+/*
+ * "islow" forward DCT entry point: passes the data pointer straight to the
+ * Neon routine.
+ */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+/*
+ * "ifast" forward DCT entry point: passes the data pointer straight to the
+ * Neon routine.
+ */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+/*
+ * Empty stub: leaves data untouched.  jsimd_can_fdct_float() always returns
+ * 0, so callers that check it first should never get here.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  /* intentionally empty */
+}
+
+/*
+ * Tell the caller whether the Neon integer quantizer may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, JCOEF and DCTELEM are both 2 bytes and
+ * init_simd() has left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Floating-point quantization is never offered by this file
+ * (jsimd_quantize_float() is an empty stub), so this always returns 0.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+/*
+ * Integer quantization entry point: passes all three arguments straight to
+ * the Neon routine.
+ */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block,
+                      divisors,
+                      workspace);
+}
+
+/*
+ * Empty stub: does nothing with its arguments.  jsimd_can_quantize_float()
+ * always returns 0, so callers that check it first should never get here.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  /* intentionally empty */
+}
+
+/*
+ * Tell the caller whether the Neon reduced-size 2x2 inverse DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, samples are 8 bits wide, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Tell the caller whether the Neon reduced-size 4x4 inverse DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, samples are 8 bits wide, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * 2x2 inverse DCT entry point: calls the Neon routine with the component's
+ * dct_table and the caller's coefficient block and output location.
+ * cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table,
+                      coef_block,
+                      output_buf,
+                      output_col);
+}
+
+/*
+ * 4x4 inverse DCT entry point: calls the Neon routine with the component's
+ * dct_table and the caller's coefficient block and output location.
+ * cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table,
+                      coef_block,
+                      output_buf,
+                      output_col);
+}
+
+/*
+ * Tell the caller whether the Neon "islow" inverse DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, samples are 8 bits wide, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Tell the caller whether the Neon "ifast" inverse DCT may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, samples are 8 bits wide, JCOEF and
+ * IFAST_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes, IFAST_SCALE_BITS is 2
+ * and init_simd() has left the JSIMD_NEON bit set in simd_support;
+ * 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The Neon code is written for these build-time values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 || IFAST_SCALE_BITS != 2)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * A floating-point inverse DCT is never offered by this file
+ * (jsimd_idct_float() is an empty stub), so this always returns 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+/*
+ * "islow" inverse DCT entry point: calls the Neon routine with the
+ * component's dct_table and the caller's coefficient block and output
+ * location.  cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table,
+                        coef_block,
+                        output_buf,
+                        output_col);
+}
+
+/*
+ * "ifast" inverse DCT entry point: calls the Neon routine with the
+ * component's dct_table and the caller's coefficient block and output
+ * location.  cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table,
+                        coef_block,
+                        output_buf,
+                        output_col);
+}
+
+/*
+ * Empty stub: writes nothing to output_buf.  jsimd_can_idct_float() always
+ * returns 0, so callers that check it first should never get here.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /* intentionally empty */
+}
+
+/*
+ * Tell the caller whether the Neon Huffman block encoder may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, JCOEF is 2 bytes, init_simd() has left
+ * the JSIMD_NEON bit set in simd_support and simd_huffman is non-zero;
+ * 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* Neon must be present and the Huffman SIMD path must be switched on */
+  return ((simd_support & JSIMD_NEON) && simd_huffman) ? 1 : 0;
+}
+
+/*
+ * Huffman block encoding entry point: passes every argument straight to the
+ * Neon routine and returns the pointer that routine returns.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state,
+                                          buffer,
+                                          block,
+                                          last_dc_val,
+                                          dctbl,
+                                          actbl);
+}
+
+/*
+ * Tell the caller whether the Neon encode_mcu_AC_first "prepare" routine may
+ * be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, JCOEF is 2 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * encode_mcu_AC_first "prepare" entry point: passes every argument straight
+ * to the Neon routine.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block,
+                                         jpeg_natural_order_start,
+                                         Sl,
+                                         Al,
+                                         values,
+                                         zerobits);
+}
+
+/*
+ * Tell the caller whether the Neon encode_mcu_AC_refine "prepare" routine
+ * may be used.
+ *
+ * Returns 1 only when DCTSIZE is 8, JCOEF is 2 bytes and init_simd() has
+ * left the JSIMD_NEON bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * encode_mcu_AC_refine "prepare" entry point: passes every argument straight
+ * to the Neon routine and returns that routine's int result unchanged.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl,
+                                                 Al,
+                                                 absvalues,
+                                                 bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Assembler setup for the whole file.  The file goes through the C
+ * preprocessor first (it uses #if/#define), then GNU as.
+ */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+/* Code section; Neon instructions enabled; Armv7-A instruction set */
+.text
+.fpu neon
+.arch armv7a
+/* NOTE(review): .object_arch presumably lowers the architecture recorded in
+ * the object attributes so the object can be linked into builds that select
+ * Neon at run time - confirm against the GNU as ARM directive reference. */
+.object_arch armv4
+/* A32 (not Thumb) encoding, written in unified syntax */
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/*
+ * Supplementary macro for setting function attributes.
+ *
+ * Usage: "asm_function name" at the point where the function body starts.
+ * It exports the symbol, restricts its visibility and emits the entry label:
+ *   - __APPLE__: the symbol gets a leading underscore and is marked
+ *     .private_extern.
+ *   - otherwise: the plain name is made global; on ELF it is additionally
+ *     marked .hidden and typed as a function.
+ */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+/* Bias added to the signed 8-bit IDCT results when they are converted to
+ * unsigned samples (see the vmov.u8/vadd.u8 pair in the islow epilogue) */
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+/*
+ * Fixed-point multipliers for the islow IDCT.  Each FIX_a_bcd value is the
+ * real constant a.bcd multiplied by 2^13 (8192) and rounded to the nearest
+ * integer, e.g. 0.298631336 * 8192 = 2446.39 -> 2446.
+ */
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
+
+/*
+ * Pre-combined sums and differences.  These, together with a few of the
+ * plain constants above, are what the constant table stores as 16-bit lanes.
+ */
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ *
+ * The eight arguments are the inputs of one 1-D transform.  The results are
+ * written to tmp0-tmp3 and tmp10-tmp13, which the macro does not declare:
+ * they, MULTIPLY, DCTELEM and JLONG must be provided wherever it is expanded.
+ * The temporaries q1-q7 are used in the same order as the Neon registers of
+ * the same names in jsimd_idct_islow_neon.
+ * NOTE(review): no expansion of this macro is visible in this part of the
+ * file; it appears to serve as documentation for the assembly - confirm.
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+}
+
+/*
+ * Scalar-lane aliases for the constants once the table below has been loaded
+ * into d0-d2.  The lane numbers must stay in step with the order of the
+ * .short entries in jsimd_idct_islow_neon_consts.
+ */
+#define XFIX_0_899976223 d0[0]
+#define XFIX_0_541196100 d0[1]
+#define XFIX_2_562915447 d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
+#define XFIX_1_175875602 d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+
+/*
+ * Constant table: 12 halfwords (24 bytes), 16-byte aligned because it is
+ * loaded with a :128 alignment hint.  The function's first load reads
+ * d0-d3 (32 bytes), so d3 receives the 8 bytes that follow the table; no
+ * XFIX_ alias refers to d3.
+ */
+.balign 16
+jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* d0[0] */
+ .short FIX_0_541196100 /* d0[1] */
+ .short FIX_2_562915447 /* d0[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
+ .short FIX_1_175875602 /* d1[3] */
+ /* reloadable constants: d2 is the low half of q1, which the IDCT code also
+ * uses as a scratch accumulator, so these four are re-read from memory
+ * (vld1.s16 {d2}, [ip]) at the start of every 1-D pass */
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+ /*
+ * Register usage.
+ *
+ * On entry r0-r3 hold the four arguments in prototype order (dct_table,
+ * coef_block, output_buf, output_col).  TMP1-TMP3 alias the same core
+ * registers as the first three arguments, so they are only used in the
+ * epilogue, after those arguments are no longer needed.  r0 is also reused
+ * as the accumulator of the zero test once the last DCT_TABLE load is done.
+ * ip first points at the constant table and then at its reloadable part.
+ *
+ * Each ROWn is one q register split into a left (L, columns 0-3) and a
+ * right (R, columns 4-7) d register.  q0-q7 are used for constants and
+ * scratch; d8-d15 (q4-q7) are saved with vpush and restored in the
+ * epilogue.
+ */
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ ROW0L .req d16
+ ROW0R .req d17
+ ROW1L .req d18
+ ROW1R .req d19
+ ROW2L .req d20
+ ROW2R .req d21
+ ROW3L .req d22
+ ROW3R .req d23
+ ROW4L .req d24
+ ROW4R .req d25
+ ROW5L .req d26
+ ROW5R .req d27
+ ROW6L .req d28
+ ROW6R .req d29
+ ROW7L .req d30
+ ROW7R .req d31
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ /* Coefficient and table loads are interleaved with the multiplies.  Three of
+ * the four COEF_BLOCK loads post-increment, so COEF_BLOCK ends up 96 bytes
+ * past the start of the block. */
+ adr ip, jsimd_idct_islow_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
+ /* ip now points at the four reloadable constants (the d2 lanes) */
+ add ip, ip, #16
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d15} /* save Neon registers */
+ /* 1-D IDCT, pass 1, left 4x8 half */
+ vadd.s16 d4, ROW7L, ROW3L
+ vadd.s16 d5, ROW5L, ROW1L
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d5, XFIX_1_175875602
+ vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ /* Each ldrd below reads the four raw (not dequantized) coefficients of
+ * columns 4-7 of one row into r4/r5; the -96 undoes the post-increments
+ * of COEF_BLOCK.  Rows 1-7 are ORed into r0.  These core-register
+ * instructions are interleaved with the Neon arithmetic of pass 1. */
+ push {r4, r5}
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ orr r0, r0, r4
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ orr r0, r0, r4
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ /* Pass 1 results are narrowed with a rounding shift by 11 */
+ vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ orr r0, r0, r4
+ vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
+ vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
+ vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
+ vadd.s32 q2, q5, q6
+ /* The only flag-setting instruction of the scan: Z is set when rows 1-7 of
+ * the right half are all zero.  Nothing up to the beq below changes the
+ * flags again. */
+ orrs r0, r0, r5
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ /* r0 = OR of the row 0 right-half coefficients (plain orr, flags are kept);
+ * it is tested with cmp at label 3 */
+ orr r0, r4, r5
+ vsub.s32 q3, q1, q4
+ pop {r4, r5}
+ vrshrn.s32 ROW7L, q2, #11
+ vrshrn.s32 ROW3L, q5, #11
+ vrshrn.s32 ROW0L, q6, #11
+ vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Go to do some special handling for the sparse
+ right 4x8 half */
+
+ /* 1-D IDCT, pass 1, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vadd.s16 d10, ROW7R, ROW3R
+ vadd.s16 d8, ROW5R, ROW1R
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d8, XFIX_1_175875602
+ vtrn.16 ROW2L, ROW3L
+ vmull.s16 q7, d10, XFIX_1_175875602
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
+ vtrn.16 ROW0L, ROW1L
+ vsubl.s16 q3, ROW0R, ROW4R
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vtrn.16 ROW4L, ROW5L
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vtrn.32 ROW1L, ROW3L
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vtrn.32 ROW4L, ROW6L
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vtrn.32 ROW0L, ROW2L
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1R, q1, #11
+ vtrn.32 ROW5L, ROW7L
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vrshrn.s32 ROW6R, q1, #11
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0R, ROW4R
+ vrshrn.s32 ROW2R, q1, #11
+ vrshrn.s32 ROW5R, q3, #11
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vrshrn.s32 ROW7R, q2, #11
+ vrshrn.s32 ROW3R, q5, #11
+ vrshrn.s32 ROW0R, q6, #11
+ vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+ /* Pass 2 note: only the four 4x4 quadrants were transposed in place; the
+ * two off-diagonal quadrants were not exchanged.  The "X <-> Y" remarks
+ * mark the places where register Y is therefore used in the role of X.
+ * Pass 2 narrows with a plain (non-rounding) shift by 16; the remaining
+ * rounding shift by 2 is done in the epilogue at label 2. */
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
+ vmov q4, q6
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+
+2: /* Descale to 8-bit and range limit */
+ /* Saturating, rounding narrow of q8-q15 into the 8-bit lanes of d16-d23 */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
+ vpop {d8 - d15} /* restore Neon registers */
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ /* OUTPUT_BUF is read as an array of eight row pointers.  OUTPUT_COL is added
+ * to each pointer and 8 bytes (one d register) are stored per row. */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ /* Last four row pointers.  No writeback here: TMP3 is the same register as
+ * OUTPUT_BUF, which is overwritten by this load. */
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Reached when rows 1-7 of the right half are all zero.  Pass 1 is not run
+ * on the right half: ROW0R is only scaled by PASS1_BITS below. */
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ /* r0 holds the OR of the row 0 right-half coefficients */
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
+ pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ /* The vdups build the transposed right half directly: lane k of the scaled
+ * ROW0R is broadcast into rows k and k + 4.  ROW0R itself is overwritten
+ * last because the earlier vdups still read it. */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+ /* Same arithmetic as the pass at label 1 with every term that reads a
+ * ROWnR input left out, because the whole right half is zero here. */
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+ b 2b /* Go to epilogue */
+
+ /* Drop the register aliases so that later code can define them again */
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+ .unreq ROW0L
+ .unreq ROW0R
+ .unreq ROW1L
+ .unreq ROW1R
+ .unreq ROW2L
+ .unreq ROW2R
+ .unreq ROW3L
+ .unreq ROW3R
+ .unreq ROW4L
+ .unreq ROW4R
+ .unreq ROW5L
+ .unreq ROW5R
+ .unreq ROW6L
+ .unreq ROW6R
+ .unreq ROW7L
+ .unreq ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+/* Scalar-lane aliases for the four constants once the table below has been
+ * loaded into d0; the lane numbers follow the order of the .short entries */
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+/*
+ * Only the part of each multiplier that is below 1 is stored.  An entry is
+ * (N - 256 * k) * 128, where N / 256 approximates the constant and k is its
+ * integer part (1 for the first three, 2 for 2.613125930).  Multiplying by
+ * 128 turns the 1/256 scale into a 1/32768 scale, e.g. (277 - 256) * 128 =
+ * 2688 and 2688 / 32768 = 0.082.  The integer part is restored in the code
+ * by adding the input back once (twice for 2.613125930) after VQDMULH.
+ */
+.balign 16
+jsimd_idct_ifast_neon_consts:
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+/* Dequantizes the 8x8 block at COEF_BLOCK with DCT_TABLE, applies two 1-D IDCT passes, and stores eight 8-byte rows via the pointers in OUTPUT_BUF. */
+ DCT_TABLE .req r0 /* 64 16-bit multipliers applied element-wise to the coefficients */
+ COEF_BLOCK .req r1 /* 64 16-bit coefficients */
+ OUTPUT_BUF .req r2 /* array of eight output row pointers */
+ OUTPUT_COL .req r3 /* byte offset added to every row pointer */
+ TMP1 .req r0 /* r0 is reused once DCT_TABLE has been consumed */
+ TMP2 .req r1 /* r1 is reused once COEF_BLOCK has been consumed */
+ TMP3 .req r2 /* aliases OUTPUT_BUF; only written by the final ldmia */
+ TMP4 .req ip /* ip is free after the constants have been loaded */
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_ifast_neon_consts /* ip = address of the VQDMULH multiplier table */
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! /* q0-q1 recycled for the multipliers of rows 4-5 */
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] /* last coefficient load: no writeback */
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! /* q2-q3 recycled for the multipliers of rows 6-7 */
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0}, [ip, :64] /* load constants: d0 lanes are the XFIX_* multipliers */
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d13} /* save Neon registers: q4-q6 are used as scratch below */
+ /* 1-D IDCT, pass 1 */
+ vsub.s16 q2, q10, q14 /* q2 = row2 - row6 */
+ vadd.s16 q14, q10, q14 /* q14 = row2 + row6 */
+ vsub.s16 q1, q11, q13 /* q1 = row3 - row5 */
+ vadd.s16 q13, q11, q13 /* q13 = row3 + row5 */
+ vsub.s16 q5, q9, q15 /* q5 = row1 - row7 */
+ vadd.s16 q15, q9, q15 /* q15 = row1 + row7 */
+ vqdmulh.s16 q4, q2, XFIX_1_414213562 /* fractional part only: q4 = q2 * 0.414... */
+ vqdmulh.s16 q6, q1, XFIX_2_613125930 /* fractional part only: q6 = q1 * 0.613... */
+ vadd.s16 q3, q1, q1 /* q3 = 2 * q1, the integer part of the 2.613 multiply */
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4 /* q10 = (row2 - row6) * 1.414... */
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6 /* q3 = (row3 - row5) * 2.613... */
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4 /* q1 = q1 * 1.847... */
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6 /* q2 = q2 * 1.414... */
+ vsub.s16 q6, q8, q12 /* q6 = row0 - row4 */
+ vadd.s16 q12, q8, q12 /* q12 = row0 + row4 */
+ vadd.s16 q9, q5, q4 /* q9 = (row1 - row7) * 1.082... */
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6 /* output 7 of this pass */
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6 /* output 0 of this pass */
+ vadd.s16 q14, q5, q3 /* output 6 of this pass */
+ vsub.s16 q9, q5, q3 /* output 1 of this pass */
+ vsub.s16 q13, q10, q2 /* output 5 of this pass */
+ vadd.s16 q10, q10, q2 /* output 2 of this pass */
+ /* Transpose */
+ vtrn.16 q8, q9
+ vsub.s16 q11, q12, q1 /* output 3 of pass 1, interleaved with the transpose */
+ vtrn.16 q14, q15
+ vadd.s16 q12, q12, q1 /* output 4 of pass 1, interleaved with the transpose */
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q8, q10
+ vtrn.32 q13, q15
+ vswp d28, d21 /* completes q10 and q14 */
+ vswp d26, d19 /* completes q9 and q13 */
+ /* 1-D IDCT, pass 2 */
+ vsub.s16 q2, q10, q14 /* same sequence as pass 1; starts while the remaining swaps finish */
+ vswp d30, d23 /* completes q11 and q15 */
+ vadd.s16 q14, q10, q14
+ vswp d24, d17 /* completes q8 and q12 */
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vpop {d8 - d13} /* restore Neon registers: q4-q6 are not used past this point */
+ vadd.s16 q10, q10, q2
+ vsub.s16 q11, q12, q1
+ vadd.s16 q12, q12, q1
+ /* Descale to 8-bit and range limit */
+ vmov.u8 q0, #0x80 /* every byte of q0 = 128 */
+ vqshrn.s16 d16, q8, #5 /* saturating shift right by 5, narrowed to signed 8-bit */
+ vqshrn.s16 d17, q9, #5
+ vqshrn.s16 d18, q10, #5
+ vqshrn.s16 d19, q11, #5
+ vqshrn.s16 d20, q12, #5
+ vqshrn.s16 d21, q13, #5
+ vqshrn.s16 d22, q14, #5
+ vqshrn.s16 d23, q15, #5
+ vadd.u8 q8, q8, q0 /* adding 128 modulo 256 maps the signed bytes onto 0..255 */
+ vadd.u8 q9, q9, q0
+ vadd.u8 q10, q10, q0
+ vadd.u8 q11, q11, q0
+ /* Transpose the final 8-bit samples */
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} /* fetch the first two row pointers and advance */
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} /* next two row pointers */
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vtrn.8 d20, d21 /* remaining byte transposes are interleaved with the stores */
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} /* last four row pointers; no writeback because TMP3 is OUTPUT_BUF itself */
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vst1.8 {d20}, [TMP1]
+ vtrn.8 d22, d23
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+/* Release the register aliases so later code in this file can redefine the same names. */
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size /* store \size converted pixels: d20 to [Y], d21 to [U], d22 to [V], post-incrementing each pointer */
+ .if \size == 8
+ vst1.8 {d20}, [Y]! /* full group: all eight lanes of each plane */
+ vst1.8 {d21}, [U]!
+ vst1.8 {d22}, [V]!
+ .elseif \size == 4
+ vst1.8 {d20[0]}, [Y]! /* a group of four occupies lanes 0-3, stored one byte at a time */
+ vst1.8 {d20[1]}, [Y]!
+ vst1.8 {d20[2]}, [Y]!
+ vst1.8 {d20[3]}, [Y]!
+ vst1.8 {d21[0]}, [U]!
+ vst1.8 {d21[1]}, [U]!
+ vst1.8 {d21[2]}, [U]!
+ vst1.8 {d21[3]}, [U]!
+ vst1.8 {d22[0]}, [V]!
+ vst1.8 {d22[1]}, [V]!
+ vst1.8 {d22[2]}, [V]!
+ vst1.8 {d22[3]}, [V]!
+ .elseif \size == 2
+ vst1.8 {d20[4]}, [Y]! /* a group of two occupies lanes 4-5 */
+ vst1.8 {d20[5]}, [Y]!
+ vst1.8 {d21[4]}, [U]!
+ vst1.8 {d21[5]}, [U]!
+ vst1.8 {d22[4]}, [V]!
+ vst1.8 {d22[5]}, [V]!
+ .elseif \size == 1
+ vst1.8 {d20[6]}, [Y]! /* a single pixel occupies lane 6 */
+ vst1.8 {d21[6]}, [U]!
+ vst1.8 {d22[6]}, [V]!
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_load bpp, size /* load \size pixels of \bpp bits from [RGB], de-interleaving byte k of each pixel into d1k, and advance RGB */
+ .if \bpp == 24
+ .if \size == 8
+ vld3.8 {d10, d11, d12}, [RGB]! /* eight 3-byte pixels: one channel per register */
+ pld [RGB, #128] /* prefetch hint for upcoming input */
+ .elseif \size == 4
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! /* a group of four goes to lanes 0-3, one pixel per load */
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
+ .elseif \size == 2
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! /* a group of two goes to lanes 4-5 */
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
+ .elseif \size == 1
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! /* a single pixel goes to lane 6 */
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ vld4.8 {d10, d11, d12, d13}, [RGB]! /* eight 4-byte pixels: one channel per register */
+ pld [RGB, #128] /* prefetch hint for upcoming input */
+ .elseif \size == 4
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! /* same lane layout as the 24-bit case */
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+ .elseif \size == 2
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+ .elseif \size == 1
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+/* Emits the function jsimd_<colorid>_ycc_convert_neon and its constant table; r_offs, g_offs and b_offs are the digit appended to d1 to name the register holding each channel. */
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+/* Stage 1: widen eight pixels to 16 bits and accumulate the 32-bit products for Y, Cb and Cr. */
+.macro do_rgb_to_yuv_stage1
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0] /* q7 = Y for pixels 0-3: r * 19595 */
+ vmlal.u16 q7, d6, d0[1] /* + g * 38470 */
+ vmlal.u16 q7, d8, d0[2] /* + b * 7471 */
+ vmull.u16 q8, d5, d0[0] /* q8 = Y for pixels 4-7 */
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vrev64.32 q9, q1 /* seed the Cb accumulators from q1; all its 32-bit words are equal, so the word swap acts as a copy */
+ vrev64.32 q13, q1
+ vmlsl.u16 q9, d4, d0[3] /* Cb for pixels 0-3: - r * 11059 */
+ vmlsl.u16 q9, d6, d1[0] /* - g * 21709 */
+ vmlal.u16 q9, d8, d1[1] /* + b * 32768 */
+ vmlsl.u16 q13, d5, d0[3] /* Cb for pixels 4-7 */
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1 /* seed the Cr accumulators the same way */
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1] /* Cr for pixels 0-3: + r * 32768 */
+ vmlsl.u16 q14, d6, d1[2] /* - g * 27439 */
+ vmlsl.u16 q14, d8, d1[3] /* - b * 5329 */
+ vmlal.u16 q15, d5, d1[1] /* Cr for pixels 4-7 */
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+/* Stage 2: shift the accumulators right by 16 and narrow them to bytes in d20 (Y), d21 (Cb) and d22 (Cr). */
+.macro do_rgb_to_yuv_stage2
+ vrshrn.u32 d20, q7, #16 /* Y uses a rounding shift */
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16 /* Cb and Cr use a truncating shift; their rounding term is part of the seed */
+ vshrn.u32 d23, q13, #16
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovn.u16 d22, q12 /* d22 = v */
+.endm
+/* Unpipelined form, used for the partial group at the end of a row. */
+.macro do_rgb_to_yuv
+ do_rgb_to_yuv_stage1
+ do_rgb_to_yuv_stage2
+.endm
+/* Loop body: stage 2 and the stores for the previous eight pixels, interleaved with the load and stage 1 of the next eight. */
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vrev64.32 q9, q1 /* q9 has just been narrowed, so it can be re-seeded for the next group */
+ vshrn.u32 d23, q13, #16
+ vrev64.32 q13, q1
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ do_load \bpp, 8
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovn.u16 d22, q12 /* d22 = v */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vst1.8 {d20}, [Y]! /* store the previous group's Y bytes */
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vst1.8 {d21}, [U]! /* store the previous group's Cb bytes */
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vst1.8 {d22}, [V]! /* store the previous group's Cr bytes */
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+/* Constant table: the first two rows are the multipliers used through d0 and d1, the last two fill q1 with the Cb/Cr seed. */
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+/* Each 32767, 128 pair reads as the 32-bit word (128 << 16) + 32767 when halfwords are stored little-endian. */
+asm_function jsimd_\colorid\()_ycc_convert_neon
+ OUTPUT_WIDTH .req r0 /* pixels per row */
+ INPUT_BUF .req r1 /* array of input row pointers, consumed one per row */
+ OUTPUT_BUF .req r2 /* array of three plane pointers */
+ OUTPUT_ROW .req r3 /* index of the first output row */
+ NUM_ROWS .req r4 /* loaded from the stack below */
+/* OUTPUT_BUF2 shares r2 with OUTPUT_BUF, so it must be the last plane pointer loaded. */
+ OUTPUT_BUF0 .req r5
+ OUTPUT_BUF1 .req r6
+ OUTPUT_BUF2 .req OUTPUT_BUF
+/* Per-row working pointers and the remaining-pixel counter. */
+ RGB .req r7
+ Y .req r8
+ U .req r9
+ V .req r10
+ N .req ip
+/* ip holds the table address only until the vld1 below; afterwards it serves as N. */
+ /* Load constants to d0, d1, d2, d3 */
+ adr ip, jsimd_\colorid\()_ycc_neon_consts
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
+
+ /* Save Arm registers and handle input arguments */
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ ldr NUM_ROWS, [sp, #(4 * 8)] /* stack argument sits just above the eight registers pushed */
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] /* overwrites r2 with the third plane pointer */
+ .unreq OUTPUT_BUF
+
+ /* Save Neon registers */
+ vpush {d8 - d15}
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ blt 9f /* nothing to do when NUM_ROWS < 1 */
+0:
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] /* row pointers are 4 bytes each */
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+ mov N, OUTPUT_WIDTH
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+ add OUTPUT_ROW, OUTPUT_ROW, #1
+ ldr RGB, [INPUT_BUF], #4 /* fetch this row's input pointer and advance to the next */
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ blt 3f /* fewer than 8 pixels in the row: go straight to the tail */
+ do_load \bpp, 8
+ do_rgb_to_yuv_stage1
+ subs N, N, #8
+ blt 2f /* no further full group: drain the pipeline */
+1:
+ do_rgb_to_yuv_stage2_store_load_stage1
+ subs N, N, #8
+ bge 1b
+2:
+ do_rgb_to_yuv_stage2 /* finish and store the last full group */
+ do_store 8
+ tst N, #7
+ beq 8f /* width was a multiple of 8: row done */
+3:
+ tst N, #4 /* N is negative here, but its low three bits still equal the leftover pixel count */
+ beq 3f
+ do_load \bpp, 4
+3:
+ tst N, #2
+ beq 4f
+ do_load \bpp, 2
+4:
+ tst N, #1
+ beq 5f
+ do_load \bpp, 1
+5:
+ do_rgb_to_yuv /* converts all eight lanes; only the loaded ones are stored below */
+ tst N, #4
+ beq 6f
+ do_store 4
+6:
+ tst N, #2
+ beq 7f
+ do_store 2
+7:
+ tst N, #1
+ beq 8f
+ do_store 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 0b
+9:
+ /* Restore all registers and return */
+ vpop {d8 - d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, pc} /* popping the saved lr into pc returns to the caller */
+/* Release the register aliases so the next instantiation can define them again. */
+ .unreq OUTPUT_WIDTH
+ .unreq OUTPUT_ROW
+ .unreq INPUT_BUF
+ .unreq NUM_ROWS
+ .unreq OUTPUT_BUF0
+ .unreq OUTPUT_BUF1
+ .unreq OUTPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+/* Remove the nested stage macros so the next instantiation can define them with its own channel offsets. */
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R G B (R, G, B = byte index of each channel within a pixel) */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+/* All variants are generated, so drop the shared load/store helper macros. */
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Scratch buffer for the final (image_width % 16) pixels of a row, so that full-width vector loads never read past the input row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767); /* 128 plus just under 0.5, scaled by 2^16 */
+ /* Convert num_rows scanlines: one input row pointer and one output row index per iteration. */
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* Main loop: 16 pixels per iteration, loaded directly from the input row. */
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+ /* Suffixes: the first letter picks pixels 0-7 (l) or 8-15 (h), the second the low or high four of those. */
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ /* 9 to 15 columns remain. Only that many pixels of tmp_buf were just copied; the other lanes yield values that land in the permitted overwrite area. */
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data (1 to 8 here) are first memcopied to
+ * a temporary buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ /* An 8-pixel vector suffices here; lanes beyond cols_remaining yield values that land in the permitted overwrite area. */
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+ cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+/* Byte-index table used by jsimd_huff_encode_one_block_neon() to shuffle the
+ * 16-bit DCT coefficients of a block into zig-zag order with table lookups.
+ * Each group of 16 bytes supplies the indices for one output row of eight
+ * coefficients (two byte indices per coefficient).  The indices are relative
+ * to whichever set of input rows is passed to the lookup for that output row
+ * (rows 0-3, 2-5, 4-7, or 5-7 of the block).  An index of 255 marks a lane
+ * that the chosen set of input rows cannot supply; the encoder sets those
+ * lanes individually after the lookups.
+ */
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+ 0, 1, 2, 3, 16, 17, 32, 33,
+ 18, 19, 4, 5, 6, 7, 20, 21,
+ 34, 35, 48, 49, 255, 255, 50, 51,
+ 36, 37, 22, 23, 8, 9, 10, 11,
+ 255, 255, 6, 7, 20, 21, 34, 35,
+ 48, 49, 255, 255, 50, 51, 36, 37,
+ 54, 55, 40, 41, 26, 27, 12, 13,
+ 14, 15, 28, 29, 42, 43, 56, 57,
+ 6, 7, 20, 21, 34, 35, 48, 49,
+ 50, 51, 36, 37, 22, 23, 8, 9,
+ 26, 27, 12, 13, 255, 255, 14, 15,
+ 28, 29, 42, 43, 56, 57, 255, 255,
+ 52, 53, 54, 55, 40, 41, 26, 27,
+ 12, 13, 255, 255, 14, 15, 28, 29,
+ 26, 27, 40, 41, 42, 43, 28, 29,
+ 14, 15, 30, 31, 44, 45, 46, 47
+};
+
+/* Huffman-encode one 8x8 block of DCT coefficients.
+ *
+ * state        cast to working_state *; cur.put_buffer and cur.free_bits are
+ *              loaded on entry and stored back before returning
+ * buffer       output pointer; returned to the caller (presumably advanced by
+ *              the PUT_BITS()/PUT_CODE() macros, which are defined outside
+ *              this file -- TODO confirm)
+ * block        the 64 coefficients; shuffled into zig-zag order here
+ * last_dc_val  subtracted from block[0] to form the DC difference
+ * dctbl/actbl  tables supplying ehufco[]/ehufsi[] for DC and AC symbols
+ */
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address. That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const uint8x16x4_t idx_rows_0123 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+ const uint8x16x4_t idx_rows_4567 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+ /* GCC does not currently support intrinsics vld1q_<type>_x4(). */
+ const uint8x16x4_t idx_rows_0123 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+ } };
+ const uint8x16x4_t idx_rows_4567 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+ } };
+#endif
+
+ /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const int8x16x4_t tbl_rows_0123 =
+ vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+ const int8x16x4_t tbl_rows_4567 =
+ vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+ const int8x16x4_t tbl_rows_0123 = { {
+ vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+ } };
+ const int8x16x4_t tbl_rows_4567 = { {
+ vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+ } };
+#endif
+
+ /* Initialise extra lookup tables. */
+ const int8x16x4_t tbl_rows_2345 = { {
+ tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+ tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+ } };
+ const int8x16x3_t tbl_rows_567 =
+ { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+ /* Shuffle coefficients into zig-zag order. */
+ int16x8_t row0 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+ int16x8_t row1 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+ int16x8_t row2 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+ int16x8_t row3 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+ int16x8_t row4 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+ int16x8_t row5 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+ int16x8_t row6 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+ int16x8_t row7 =
+ vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+ /* Compute DC coefficient difference value (F.1.1.5.1). */
+ row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+ /* Initialize AC coefficient lanes not reachable by lookup tables.  These
+ * are the lanes whose indices are 255 in
+ * jsimd_huff_encode_one_block_consts[].
+ */
+ row1 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+ 0), row1, 2);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 4), row2, 0);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 0), row2, 5);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 7), row5, 2);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 3), row5, 7);
+ row6 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+ 7), row6, 5);
+
+ /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+ uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+ uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+ uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+ uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+ uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+ uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+ uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+ uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+ vreinterpretq_u8_u16(row0_ne_0));
+ uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+ vreinterpretq_u8_u16(row2_ne_0));
+ uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+ vreinterpretq_u8_u16(row4_ne_0));
+ uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+ vreinterpretq_u8_u16(row6_ne_0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x16_t bitmap_mask =
+ vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+ uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+ uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+ bitmap_rows_3210);
+ uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+ vget_high_u8(bitmap_rows_76543210));
+
+ /* Shift left to remove DC bit. */
+ bitmap_all =
+ vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+ /* Count bits set (number of non-zero coefficients) in bitmap. */
+ unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+ /* Move bitmap to 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ /* For negative coeffs: diff = coeff - 1 = ~abs(coeff), of which only the
+ * low nbits bits are kept (the mask below is all-ones shifted right by lz).
+ */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ /* Find nbits required to specify sign and amplitude of coefficient. */
+ unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+ unsigned int nbits = 16 - lz;
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* The most efficient method of computing nbits and diff depends on the
+ * number of non-zero coefficients. If the bitmap is not too sparse (> 8
+ * non-zero AC coefficients), it is beneficial to do all of the work using
+ * Neon; else we do some of the work using Neon and the rest on demand using
+ * scalar code.
+ */
+ if (non_zero_coefficients > 8) {
+ uint8_t block_nbits[DCTSIZE2];
+
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Narrow leading zero count to 8 bits. */
+ uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+ vreinterpretq_u8_s16(row1_lz));
+ uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+ vreinterpretq_u8_s16(row3_lz));
+ uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+ vreinterpretq_u8_s16(row5_lz));
+ uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+ vreinterpretq_u8_s16(row7_lz));
+ /* Compute nbits needed to specify magnitude of each coefficient. */
+ uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+ uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+ uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+ uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+ /* Store nbits. */
+ vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+ vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+ vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+ vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+ /* Mask bits not required to specify sign and amplitude of diff. */
+ uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+ uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+ uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+ uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+ uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+ uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+ uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+ /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ row3_mask);
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ row7_mask);
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Each iteration skips the run of zero coefficients preceding the next
+ * set bit in the bitmap, then encodes that non-zero coefficient.
+ */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ } else if (bitmap != 0) {
+ uint16_t block_abs[DCTSIZE2];
+ /* Compute and store absolute value of coefficients. */
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+ vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+ vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+ vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+ vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+ vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+ vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+ vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+ /* Compute diff bits (without nbits mask) and store. */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ vcltzq_s16(row1));
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ vcltzq_s16(row2));
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ vcltzq_s16(row3));
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ vcltzq_s16(row4));
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ vcltzq_s16(row5));
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ vcltzq_s16(row6));
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ vcltzq_s16(row7));
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Same as above but must mask diff bits and compute nbits on demand. */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ lz = BUILTIN_CLZ(block_abs[i]);
+ nbits = 32 - lz;
+ diff = ((unsigned int)block_diff[i] << lz) >> lz;
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..358e1597b1
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1053 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 |
+ JSIMD_FASTST3 | JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/*
+ * Test whether one line of text matches a field/value pair.
+ *
+ * buffer  NUL-terminated line; it must begin with 'field'
+ * field   prefix the line has to start with
+ * value   word to look for in the remainder of the line
+ *
+ * Returns 1 if 'value' occurs after the field name as a separate word, i.e.
+ * preceded by whitespace or the start of the remainder and followed by
+ * whitespace or the end of the string.  Returns 0 otherwise, including when
+ * 'value' is empty.
+ */
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+  const size_t field_len = strlen(field);
+  const size_t value_len = strlen(value);
+  const char *start;
+  char *p;
+
+  if (value_len == 0)
+    return 0;
+  if (strncmp(buffer, field, field_len) != 0)
+    return 0;
+  buffer += field_len;
+  /* The <ctype.h> functions are only defined for values representable as
+   * unsigned char (or EOF), so plain char must be cast before the call.
+   */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+
+  /* Check if 'value' is present in the buffer as a separate word.  The
+   * left-hand boundary is tested against the fixed start of the searched
+   * region, so a candidate that fails the test cannot be accepted on a later
+   * pass merely because the search position has caught up with it.
+   */
+  start = buffer;
+  while ((p = strstr(buffer, value))) {
+    if ((p == start || isspace((unsigned char)p[-1])) &&
+        (p[value_len] == 0 || isspace((unsigned char)p[value_len])))
+      return 1;
+    /* Not a whole word here; resume just past this occurrence. */
+    buffer = p + 1;
+  }
+  return 0;
+}
+
+/*
+ * Scan /proc/cpuinfo line by line and adjust simd_features / simd_huffman
+ * for the CPU parts matched below.  Returns 0 if the line buffer could not
+ * be allocated or a line did not fit in 'bufsize' bytes, else 1 (also when
+ * /proc/cpuinfo cannot be opened).
+ */
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *line = (char *)malloc(bufsize);
+  FILE *cpuinfo;
+  int ok = 1;
+
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo != NULL) {
+    while (fgets(line, bufsize, cpuinfo) != NULL) {
+      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        ok = 0;
+        break;
+      }
+      if (check_cpuinfo(line, "CPU part", "0xd03") ||
+          check_cpuinfo(line, "CPU part", "0xd07")) {
+        /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+           percent speedup by disabling the use of that instruction. The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      } else if (check_cpuinfo(line, "CPU part", "0x0a1")) {
+        /* The SIMD version of Huffman encoding is slower than the C version
+           on Cavium ThunderX. Also, ld3 and st3 are abysmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
+      }
+    }
+    fclose(cpuinfo);
+  }
+
+  free(line);
+  return ok;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+
+/*
+ * Armv8 architectures support Neon extensions by default.
+ * It is no longer optional as it was with Armv7.
+ */
+
+
+/* Initialise simd_support, simd_huffman and simd_features.  Does nothing
+ * if simd_support no longer holds its initial value of ~0.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ /* ~0 is the "not yet initialised" marker set in the variable's definition. */
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+ simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ /* Retry with a doubled line buffer each time parsing reports failure, up
+ * to SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT.
+ */
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+ /* JSIMD_FASTLD3 / JSIMD_FASTST3: "1" sets the feature bit, "0" clears it. */
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTST3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTST3;
+#endif
+}
+
+/* Return 1 if the Neon RGB -> YCbCr conversion may be used, else 0. */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon RGB -> grayscale conversion may be used, else 0. */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon YCbCr -> RGB conversion may be used, else 0. */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples, a 4-byte JDIMENSION and
+   * 3- or 4-byte RGB pixels only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon YCbCr -> RGB565 conversion may be used, else 0. */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Convert rows from the input colour space to YCbCr by dispatching on
+ * cinfo->in_color_space to the matching Neon routine.  Colour spaces not
+ * listed use the extrgb routine.  When NEON_INTRINSICS is not defined, the
+ * 3-byte-pixel routines have a "_slowld3" variant that is chosen if the
+ * JSIMD_FASTLD3 feature bit is clear.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifdef NEON_INTRINSICS
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+#else
+    neonfct = (simd_features & JSIMD_FASTLD3) ?
+              jsimd_extbgr_ycc_convert_neon :
+              jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+#ifdef NEON_INTRINSICS
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+#else
+    neonfct = (simd_features & JSIMD_FASTLD3) ?
+              jsimd_extrgb_ycc_convert_neon :
+              jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Convert rows from the input colour space to grayscale by dispatching on
+ * cinfo->in_color_space to the matching Neon routine.  Colour spaces not
+ * listed use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Convert rows from YCbCr to the output colour space by dispatching on
+ * cinfo->out_color_space to the matching Neon routine.  Colour spaces not
+ * listed use the extrgb routine.  When NEON_INTRINSICS is not defined, the
+ * 3-byte-pixel routines have a "_slowst3" variant that is chosen if the
+ * JSIMD_FASTST3 feature bit is clear.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifdef NEON_INTRINSICS
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+#else
+    neonfct = (simd_features & JSIMD_FASTST3) ?
+              jsimd_ycc_extbgr_convert_neon :
+              jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+#ifdef NEON_INTRINSICS
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+#else
+    neonfct = (simd_features & JSIMD_FASTST3) ?
+              jsimd_ycc_extrgb_convert_neon :
+              jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/* Forward to jsimd_ycc_rgb565_convert_neon(), passing cinfo->output_width
+ * as the width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+/* Return 1 if the Neon h2v2 downsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples, 8x8 DCT blocks and a 4-byte
+   * JDIMENSION only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon h2v1 downsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples, 8x8 DCT blocks and a 4-byte
+   * JDIMENSION only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Forward to jsimd_h2v2_downsample_neon() with the image width and maximum
+ * vertical sampling factor from cinfo, and the vertical sampling factor and
+ * width in blocks from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+/* Forward to jsimd_h2v1_downsample_neon() with the image width and maximum
+ * vertical sampling factor from cinfo, and the vertical sampling factor and
+ * width in blocks from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+/* Return 1 if the Neon h2v2 upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon h2v1 upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Forward to jsimd_h2v2_upsample_neon() with the maximum vertical sampling
+ * factor and output width from cinfo.  compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/* Forward to jsimd_h2v1_upsample_neon() with the maximum vertical sampling
+ * factor and output width from cinfo.  compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/* Return 1 if the Neon h2v2 fancy upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon h2v1 fancy upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon h1v2 fancy upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Forward to jsimd_h2v2_fancy_upsample_neon() with the maximum vertical
+ * sampling factor from cinfo and the downsampled width from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/* Forward to jsimd_h2v1_fancy_upsample_neon() with the maximum vertical
+ * sampling factor from cinfo and the downsampled width from compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/* Forward to jsimd_h1v2_fancy_upsample_neon() with the maximum vertical
+ * sampling factor from cinfo and the downsampled width from compptr.
+ */
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/* Return 1 if the Neon h2v2 merged upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Return 1 if the Neon h2v1 merged upsampler may be used, else 0. */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and a 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * h2v2 merged upsampling: dispatch on cinfo->out_color_space to the
+ * matching Neon routine.  Colour spaces not listed use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * h2v1 merged upsampling/color conversion.  Picks the Neon routine that
+ * matches cinfo->out_color_space and calls it with cinfo->output_width and
+ * the caller's buffers.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* The plain RGB routine serves JCS_EXT_RGB and every colorspace that is
+   * not listed in the switch below.
+   */
+  void (*kernel) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v1_extrgb_merged_upsample_neon;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    kernel = jsimd_h2v1_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    kernel = jsimd_h2v1_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    kernel = jsimd_h2v1_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    kernel = jsimd_h2v1_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    kernel = jsimd_h2v1_extxrgb_merged_upsample_neon;
+    break;
+  default:
+    break;
+  }
+
+  kernel(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Returns 1 if the Neon sample-conversion routine may be used, 0 otherwise.
+ * Requires DCTSIZE == 8, 8-bit samples, a 4-byte JDIMENSION, a 2-byte
+ * DCTELEM and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* Floating-point sample conversion is never reported as available here. */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+/* Sample conversion: passes all three arguments straight to the Neon routine. */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+/*
+ * No-op stub: jsimd_can_convsamp_float() always returns 0, and this function
+ * does nothing with its arguments.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+/*
+ * Returns 1 if the Neon slow-but-accurate integer forward DCT may be used,
+ * 0 otherwise.  Requires DCTSIZE == 8, a 2-byte DCTELEM and JSIMD_NEON in
+ * simd_support.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the Neon fast integer forward DCT may be used, 0 otherwise.
+ * Requires DCTSIZE == 8, a 2-byte DCTELEM and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* The floating-point forward DCT is never reported as available here. */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+/* Slow-but-accurate integer forward DCT: hands `data` to the Neon routine. */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+/* Fast integer forward DCT: hands `data` to the Neon routine. */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+/*
+ * No-op stub: jsimd_can_fdct_float() always returns 0, and this function
+ * leaves `data` untouched.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+/*
+ * Returns 1 if the Neon integer quantizer may be used, 0 otherwise.
+ * Requires DCTSIZE == 8, 2-byte JCOEF and DCTELEM, and JSIMD_NEON in
+ * simd_support.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* The floating-point quantizer is never reported as available here. */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+/* Integer quantization: passes all three arguments to the Neon routine. */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+/*
+ * No-op stub: jsimd_can_quantize_float() always returns 0, and this function
+ * does nothing with its arguments.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+/*
+ * Returns 1 if the Neon reduced-size 2x2 inverse DCT may be used, 0
+ * otherwise.  Requires DCTSIZE == 8, 2-byte JCOEF and ISLOW_MULT_TYPE,
+ * 8-bit samples, a 4-byte JDIMENSION and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the Neon reduced-size 4x4 inverse DCT may be used, 0
+ * otherwise.  Requires DCTSIZE == 8, 2-byte JCOEF and ISLOW_MULT_TYPE,
+ * 8-bit samples, a 4-byte JDIMENSION and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Reduced-size 2x2 inverse DCT: forwards the component's dct_table together
+ * with the coefficient block and output location to the Neon routine.
+ * cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block,
+                      output_buf, output_col);
+}
+
+/*
+ * Reduced-size 4x4 inverse DCT: forwards the component's dct_table together
+ * with the coefficient block and output location to the Neon routine.
+ * cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block,
+                      output_buf, output_col);
+}
+
+/*
+ * Returns 1 if the Neon slow-but-accurate integer inverse DCT may be used,
+ * 0 otherwise.  Requires DCTSIZE == 8, 2-byte JCOEF and ISLOW_MULT_TYPE,
+ * 8-bit samples, a 4-byte JDIMENSION and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the Neon fast integer inverse DCT may be used, 0 otherwise.
+ * Requires DCTSIZE == 8, 2-byte JCOEF and IFAST_MULT_TYPE, 8-bit samples,
+ * a 4-byte JDIMENSION, IFAST_SCALE_BITS == 2 and JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 || IFAST_SCALE_BITS != 2)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* The floating-point inverse DCT is never reported as available here. */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+/*
+ * Slow-but-accurate integer inverse DCT: forwards the component's dct_table
+ * together with the coefficient block and output location to the Neon
+ * routine.  cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+}
+
+/*
+ * Fast integer inverse DCT: forwards the component's dct_table together with
+ * the coefficient block and output location to the Neon routine.  cinfo is
+ * not used.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+}
+
+/*
+ * No-op stub: jsimd_can_idct_float() always returns 0, and this function
+ * does nothing with its arguments.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+/*
+ * Returns 1 if the Neon Huffman block encoder may be used, 0 otherwise.
+ * Requires DCTSIZE == 8, a 2-byte JCOEF, JSIMD_NEON in simd_support and a
+ * non-zero simd_huffman.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_NEON) && simd_huffman) ? 1 : 0;
+}
+
+/*
+ * Huffman-encode one block and return the pointer produced by the Neon
+ * routine.  Unless NEON_INTRINSICS is defined, the "slowtbl" variant is
+ * chosen when JSIMD_FASTTBL is not set in simd_features; otherwise the
+ * regular Neon routine is called.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+  if (!(simd_features & JSIMD_FASTTBL))
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+#endif
+
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+/*
+ * Returns 1 if the Neon AC-first preparation routine may be used, 0
+ * otherwise.  Requires DCTSIZE == 8, a 2-byte JCOEF, SIZEOF_SIZE_T == 8 and
+ * JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/* AC-first preparation: passes every argument through to the Neon routine. */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block,
+                                         jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+/*
+ * Returns 1 if the Neon AC-refine preparation routine may be used, 0
+ * otherwise.  Requires DCTSIZE == 8, a 2-byte JCOEF, SIZEOF_SIZE_T == 8 and
+ * JSIMD_NEON in simd_support.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 8)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * AC-refine preparation: passes every argument through to the Neon routine
+ * and returns its result unchanged.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
new file mode 100644
index 0000000000..738a4f0658
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -0,0 +1,2254 @@
+/*
+ * Armv8 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+/*
+ * Select the section that receives the constant tables defined below (up to
+ * the .text directive); the section name depends on the target platform.
+ */
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon()
+ *
+ * Each F_* macro is the FIX(x) value quoted beside it, i.e. x scaled by 2^13
+ * and rounded (13 is the CONST_BITS value used by the routine).  The table
+ * lists them in the order expected by the XFIX_* lane aliases (v0.h[0..7],
+ * then v1.h[0..3]); entries whose alias is XFIX_N_* are stored negated.  The
+ * four trailing zeros pad the table to 32 bytes, the amount fetched by the
+ * routine's two-register load.
+ */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+ .short F_0_298
+ .short -F_0_390
+ .short F_0_541
+ .short F_0_765
+ .short - F_0_899
+ .short F_1_175
+ .short F_1_501
+ .short - F_1_847
+ .short - F_1_961
+ .short F_2_053
+ .short - F_2_562
+ .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon()
+ *
+ * Four rows of four 16-bit values, aligned to 16 bytes.
+ * NOTE(review): the second row looks like the YCbCr->RGB coefficients
+ * 1.40200, -0.34414, -0.71414 and 1.77200 in fixed point (scaled by 2^14,
+ * 2^15, 2^15 and 2^14 respectively), and the -128 rows like the chroma
+ * centering offset; the consuming routines are not shown here -- confirm.
+ */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+ .short 0, 0, 0, 0
+ .short 22971, -11277, -23401, 29033
+ .short -128, -128, -128, -128
+ .short -128, -128, -128, -128
+
+/* Constants for jsimd_*_ycc_neon()
+ *
+ * Four rows of four 16-bit values, aligned to 16 bytes.
+ * NOTE(review): the first two rows look like the RGB->YCbCr coefficients
+ * 0.29900, 0.58700, 0.11400, 0.16874, 0.33126, 0.50000, 0.41869 and 0.08131
+ * scaled by 2^16; the 32767/128 rows are presumably rounding and centering
+ * terms.  The consuming routines are not shown here -- confirm.
+ */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon()
+ *
+ * Same FIX() values (x scaled by 2^13), same ordering, same signs and same
+ * zero padding as the jsimd_idct_islow_neon() table earlier in this file.
+ * The F_* macros are defined again here because they were #undef'd after
+ * that table, and are #undef'd again once this table has been emitted.
+ */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+ .short F_0_298
+ .short -F_0_390
+ .short F_0_541
+ .short F_0_765
+ .short - F_0_899
+ .short F_1_175
+ .short F_1_501
+ .short - F_1_847
+ .short - F_1_961
+ .short F_2_053
+ .short - F_2_562
+ .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon()
+ *
+ * Thirteen 16-byte rows.  The first row holds the single-bit masks
+ * 0x01..0x80, repeated twice.
+ * NOTE(review): the remaining rows look like byte-index vectors for Neon
+ * table-lookup instructions, with 255 serving as an out-of-range index; the
+ * "Lx => Ly" remarks presumably name the coefficient lines each row draws
+ * from.  The consuming routine is not shown here -- confirm.
+ */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
+
+.text
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes
+ *
+ * Emits the entry label for function \fname together with its visibility
+ * directives:
+ *   - Apple targets: the symbol is spelled with a leading underscore and is
+ *     declared with .private_extern and .globl.
+ *   - Other targets: the symbol is declared .global; when building ELF it is
+ *     additionally marked .hidden and given the %function type.
+ */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location
+ *
+ * Loads the address of \symbol into \reg with an adrp/add pair: adrp
+ * produces the address of the 4 KB page containing the symbol, and add
+ * supplies the offset within that page.  Apple assemblers spell the two
+ * relocations @PAGE / @PAGEOFF; the other branch uses a bare symbol for
+ * adrp and :lo12: for the add.
+ */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+ adrp \reg, \symbol@PAGE
+ add \reg, \reg, \symbol@PAGEOFF
+#else
+ adrp \reg, \symbol
+ add \reg, \reg, :lo12:\symbol
+#endif
+.endm
+/*
+ * In-place transpose of an 8x8 matrix of 16-bit elements.
+ *
+ * l0..l7 name the eight vector registers holding the rows (pass the bare
+ * register name, e.g. v2 -- the macro appends the arrangement suffix); on
+ * exit they hold the transposed rows.  t0..t3 name four scratch vector
+ * registers, which are overwritten.
+ *
+ * The transpose is done in three rounds of trn1/trn2, interleaving 16-bit,
+ * then 32-bit, then 64-bit lanes.  The instruction order within each round
+ * matters: several destinations are also sources of later instructions.
+ */
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+ trn1 \t0\().8h, \l0\().8h, \l1\().8h
+ trn1 \t1\().8h, \l2\().8h, \l3\().8h
+ trn1 \t2\().8h, \l4\().8h, \l5\().8h
+ trn1 \t3\().8h, \l6\().8h, \l7\().8h
+ trn2 \l1\().8h, \l0\().8h, \l1\().8h
+ trn2 \l3\().8h, \l2\().8h, \l3\().8h
+ trn2 \l5\().8h, \l4\().8h, \l5\().8h
+ trn2 \l7\().8h, \l6\().8h, \l7\().8h
+
+ trn1 \l4\().4s, \t2\().4s, \t3\().4s
+ trn2 \t3\().4s, \t2\().4s, \t3\().4s
+ trn1 \t2\().4s, \t0\().4s, \t1\().4s
+ trn2 \l2\().4s, \t0\().4s, \t1\().4s
+ trn1 \t0\().4s, \l1\().4s, \l3\().4s
+ trn2 \l3\().4s, \l1\().4s, \l3\().4s
+ trn2 \t1\().4s, \l5\().4s, \l7\().4s
+ trn1 \l5\().4s, \l5\().4s, \l7\().4s
+
+ trn2 \l6\().2d, \l2\().2d, \t3\().2d
+ trn1 \l0\().2d, \t2\().2d, \l4\().2d
+ trn1 \l1\().2d, \t0\().2d, \l5\().2d
+ trn2 \l7\().2d, \l3\().2d, \t1\().2d
+ trn1 \l2\().2d, \l2\().2d, \t3\().2d
+ trn2 \l4\().2d, \t2\().2d, \l4\().2d
+ trn1 \l3\().2d, \l3\().2d, \t1\().2d
+ trn2 \l5\().2d, \t0\().2d, \l5\().2d
+.endm
+
+
+#define CENTERJSAMPLE 128 /* offset added to every output sample byte at the end of jsimd_idct_islow_neon() */
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define CONST_BITS 13 /* the FIX() constants are scaled by 2^CONST_BITS */
+#define PASS1_BITS 2 /* first-pass results are kept scaled up by 2^PASS1_BITS */
+/*
+ * Lane aliases for the constants table once it has been loaded into v0/v1.
+ * XFIX_P_* lanes hold the positive constant, XFIX_N_* lanes the negated one.
+ */
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_idct_islow_neon
+ DCT_TABLE .req x0
+ COEF_BLOCK .req x1
+ OUTPUT_BUF .req x2
+ OUTPUT_COL .req x3
+ TMP1 .req x0
+ TMP2 .req x1
+ TMP3 .req x9
+ TMP4 .req x10
+ TMP5 .req x11
+ TMP6 .req x12
+ TMP7 .req x13
+ TMP8 .req x14
+
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
+ sub sp, sp, #64
+ get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
+ ld1 {v0.8h, v1.8h}, [x15]
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+ ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+ ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+ ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
+
+ cmeq v16.8h, v3.8h, #0
+ cmeq v26.8h, v4.8h, #0
+ cmeq v27.8h, v5.8h, #0
+ cmeq v28.8h, v6.8h, #0
+ cmeq v29.8h, v7.8h, #0
+ cmeq v30.8h, v8.8h, #0
+ cmeq v31.8h, v9.8h, #0
+
+ and v10.16b, v16.16b, v26.16b
+ and v11.16b, v27.16b, v28.16b
+ and v12.16b, v29.16b, v30.16b
+ and v13.16b, v31.16b, v10.16b
+ and v14.16b, v11.16b, v12.16b
+ mul v2.8h, v2.8h, v18.8h
+ and v15.16b, v13.16b, v14.16b
+ shl v10.8h, v2.8h, #(PASS1_BITS)
+ sqxtn v16.8b, v15.8h
+ mov TMP1, v16.d[0]
+ mvn TMP2, TMP1
+
+ cbnz TMP2, 2f
+ /* case all AC coeffs are zeros */
+ dup v2.2d, v10.d[0]
+ dup v6.2d, v10.d[1]
+ mov v3.16b, v2.16b
+ mov v7.16b, v6.16b
+ mov v4.16b, v2.16b
+ mov v8.16b, v6.16b
+ mov v5.16b, v2.16b
+ mov v9.16b, v6.16b
+1:
+ /* for this transpose, we should organise data like this:
+ * 00, 01, 02, 03, 40, 41, 42, 43
+ * 10, 11, 12, 13, 50, 51, 52, 53
+ * 20, 21, 22, 23, 60, 61, 62, 63
+ * 30, 31, 32, 33, 70, 71, 72, 73
+ * 04, 05, 06, 07, 44, 45, 46, 47
+ * 14, 15, 16, 17, 54, 55, 56, 57
+ * 24, 25, 26, 27, 64, 65, 66, 67
+ * 34, 35, 36, 37, 74, 75, 76, 77
+ */
+ trn1 v28.8h, v2.8h, v3.8h
+ trn1 v29.8h, v4.8h, v5.8h
+ trn1 v30.8h, v6.8h, v7.8h
+ trn1 v31.8h, v8.8h, v9.8h
+ trn2 v16.8h, v2.8h, v3.8h
+ trn2 v17.8h, v4.8h, v5.8h
+ trn2 v18.8h, v6.8h, v7.8h
+ trn2 v19.8h, v8.8h, v9.8h
+ trn1 v2.4s, v28.4s, v29.4s
+ trn1 v6.4s, v30.4s, v31.4s
+ trn1 v3.4s, v16.4s, v17.4s
+ trn1 v7.4s, v18.4s, v19.4s
+ trn2 v4.4s, v28.4s, v29.4s
+ trn2 v8.4s, v30.4s, v31.4s
+ trn2 v5.4s, v16.4s, v17.4s
+ trn2 v9.4s, v18.4s, v19.4s
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ movi v0.16b, #(CENTERJSAMPLE)
+ /* Prepare pointers (dual-issue with Neon instructions) */
+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP3, TMP4, [OUTPUT_BUF], 16
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP1, TMP1, OUTPUT_COL
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP2, TMP2, OUTPUT_COL
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP3, TMP3, OUTPUT_COL
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP4, TMP4, OUTPUT_COL
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP5, TMP6, [OUTPUT_BUF], 16
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP7, TMP8, [OUTPUT_BUF], 16
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP5, TMP5, OUTPUT_COL
+ add v16.16b, v28.16b, v0.16b
+ add TMP6, TMP6, OUTPUT_COL
+ add v18.16b, v29.16b, v0.16b
+ add TMP7, TMP7, OUTPUT_COL
+ add v20.16b, v30.16b, v0.16b
+ add TMP8, TMP8, OUTPUT_COL
+ add v22.16b, v31.16b, v0.16b
+
+ /* Transpose the final 8-bit samples */
+ trn1 v28.16b, v16.16b, v18.16b
+ trn1 v30.16b, v20.16b, v22.16b
+ trn2 v29.16b, v16.16b, v18.16b
+ trn2 v31.16b, v20.16b, v22.16b
+
+ trn1 v16.8h, v28.8h, v30.8h
+ trn2 v18.8h, v28.8h, v30.8h
+ trn1 v20.8h, v29.8h, v31.8h
+ trn2 v22.8h, v29.8h, v31.8h
+
+ uzp1 v28.4s, v16.4s, v18.4s
+ uzp2 v30.4s, v16.4s, v18.4s
+ uzp1 v29.4s, v20.4s, v22.4s
+ uzp2 v31.4s, v20.4s, v22.4s
+
+ /* Store results to the output buffer */
+ st1 {v28.d}[0], [TMP1]
+ st1 {v29.d}[0], [TMP2]
+ st1 {v28.d}[1], [TMP3]
+ st1 {v29.d}[1], [TMP4]
+ st1 {v30.d}[0], [TMP5]
+ st1 {v31.d}[0], [TMP6]
+ st1 {v30.d}[1], [TMP7]
+ st1 {v31.d}[1], [TMP8]
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+ blr x30
+
+.balign 16
+2:
+ mul v3.8h, v3.8h, v19.8h
+ mul v4.8h, v4.8h, v20.8h
+ mul v5.8h, v5.8h, v21.8h
+ add TMP4, xzr, TMP2, LSL #32
+ mul v6.8h, v6.8h, v22.8h
+ mul v7.8h, v7.8h, v23.8h
+ adds TMP3, xzr, TMP2, LSR #32
+ mul v8.8h, v8.8h, v24.8h
+ mul v9.8h, v9.8h, v25.8h
+ b.ne 3f
+ /* Right AC coef is zero */
+ dup v15.2d, v10.d[1]
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ mov v6.16b, v15.16b
+ mov v7.16b, v15.16b
+ mov v8.16b, v15.16b
+ mov v9.16b, v15.16b
+ b 1b
+
+.balign 16
+3:
+ cbnz TMP4, 4f
+ /* Left AC coef is zero */
+ dup v14.2d, v10.d[0]
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ mov v2.16b, v14.16b
+ mov v3.16b, v14.16b
+ mov v4.16b, v14.16b
+ mov v5.16b, v14.16b
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ b 1b
+
+.balign 16
+4:
+ /* "No" AC coef is zero */
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ b 1b
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+ .unreq TMP5
+ .unreq TMP6
+ .unreq TMP7
+ .unreq TMP8
+
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+/*
+ * do_load size
+ *
+ * Fetch "size" samples from each of the Y, U and V row pointers, advancing
+ * every pointer past the bytes it consumed.  Y lands in v0, U in v4 and V in
+ * v5.  A full group of 8 fills all byte lanes and prefetches 64 bytes ahead
+ * on each plane; the partial sizes fill disjoint lanes (4 -> lanes 0-3,
+ * 2 -> lanes 4-5, 1 -> lane 6) so that any tail of 1..7 pixels can be
+ * gathered into the same registers by combining them.
+ */
+.macro do_load size
+    .if \size == 8
+        ld1             {v4.8b}, [U], 8
+        ld1             {v5.8b}, [V], 8
+        ld1             {v0.8b}, [Y], 8
+        prfm            pldl1keep, [U, #64]
+        prfm            pldl1keep, [V, #64]
+        prfm            pldl1keep, [Y, #64]
+    .elseif \size == 4
+        .irp lane, 0, 1, 2, 3
+        ld1             {v4.b}[\lane], [U], 1
+        .endr
+        .irp lane, 0, 1, 2, 3
+        ld1             {v5.b}[\lane], [V], 1
+        .endr
+        .irp lane, 0, 1, 2, 3
+        ld1             {v0.b}[\lane], [Y], 1
+        .endr
+    .elseif \size == 2
+        .irp lane, 4, 5
+        ld1             {v4.b}[\lane], [U], 1
+        .endr
+        .irp lane, 4, 5
+        ld1             {v5.b}[\lane], [V], 1
+        .endr
+        .irp lane, 4, 5
+        ld1             {v0.b}[\lane], [Y], 1
+        .endr
+    .elseif \size == 1
+        ld1             {v4.b}[6], [U], 1
+        ld1             {v5.b}[6], [V], 1
+        ld1             {v0.b}[6], [Y], 1
+    .else
+        .error unsupported macroblock size
+    .endif
+.endm
+
+/*
+ * do_store bpp, size, fast_st3
+ *
+ * Write "size" converted pixels through the RGB pointer and advance it.
+ *   bpp 24: bytes come from v10, v11, v12 (interleaved in that order).  For
+ *           a full group of 8, fast_st3 == 1 uses one st3; otherwise the
+ *           same 24 bytes are written with single-lane st1 stores.
+ *   bpp 32: bytes come from v10, v11, v12, v13 (interleaved in that order).
+ *   bpp 16: packed halfwords come from v25.
+ * Partial sizes use the same lane split as the loads: 4 -> lanes 0-3,
+ * 2 -> lanes 4-5, 1 -> lane 6.
+ */
+.macro do_store bpp, size, fast_st3
+    .if \bpp == 24
+        .if \size == 8
+            .if \fast_st3 == 1
+                st3             {v10.8b, v11.8b, v12.8b}, [RGB], 24
+            .else
+                /* One pixel (three single-byte stores) per iteration */
+                .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
+                st1             {v10.b}[\lane], [RGB], #1
+                st1             {v11.b}[\lane], [RGB], #1
+                st1             {v12.b}[\lane], [RGB], #1
+                .endr
+            .endif
+        .elseif \size == 4
+            .irp lane, 0, 1, 2, 3
+            st3             {v10.b, v11.b, v12.b}[\lane], [RGB], 3
+            .endr
+        .elseif \size == 2
+            .irp lane, 4, 5
+            st3             {v10.b, v11.b, v12.b}[\lane], [RGB], 3
+            .endr
+        .elseif \size == 1
+            st3             {v10.b, v11.b, v12.b}[6], [RGB], 3
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 32
+        .if \size == 8
+            st4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+        .elseif \size == 4
+            .irp lane, 0, 1, 2, 3
+            st4             {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], 4
+            .endr
+        .elseif \size == 2
+            .irp lane, 4, 5
+            st4             {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], 4
+            .endr
+        .elseif \size == 1
+            st4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 16
+        .if \size == 8
+            st1             {v25.8h}, [RGB], 16
+        .elseif \size == 4
+            st1             {v25.4h}, [RGB], 8
+        .elseif \size == 2
+            .irp lane, 4, 5
+            st1             {v25.h}[\lane], [RGB], 2
+            .endr
+        .elseif \size == 1
+            st1             {v25.h}[6], [RGB], 2
+        .else
+            .error unsupported macroblock size
+        .endif
+    .else
+        .error unsupported bpp
+    .endif
+.endm
+
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+ g_offs, gsize, b_offs, bsize, \
+ defsize, fast_st3
+
+/*
+ * Emits one YCbCr->RGB row converter named jsimd_ycc_<colorid>_convert_neon
+ * (with a _slowst3 suffix when fast_st3 is 0).
+ *
+ *   bpp                      16, 24 or 32; selects the packing and store path
+ *   r_offs, g_offs, b_offs   digit n such that the channel is narrowed into
+ *                            register v1n (v10..v13); ignored when bpp is 16
+ *   rsize, gsize, bsize      accepted but not referenced by this macro
+ *   defsize                  arrangement suffix appended to v1n
+ *   fast_st3                 forwarded to do_store for the 24-bpp full group
+ *
+ * Function arguments as bound below: w0 = output width, x1 = input plane
+ * array (three row-pointer arrays), w2 = first input row, x3 = output row
+ * pointer array, w4 = number of rows.
+ *
+ * The conversion is a 2-stage pipeline: stage 1 produces the 32-bit chroma
+ * products, stage 2 narrows them, adds luma and packs the pixels.
+ */
+
+/*
+ * Stage 1: widen U (v4) and V (v5), add the bias held in v2 and multiply by
+ * the coefficients in v1.h[0..3].  Products: v20/v22 green, v24/v26 red,
+ * v28/v30 blue (low/high four pixels).
+ */
+.macro do_yuv_to_rgb_stage1
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+/*
+ * Stage 2: round and narrow the products to 16 bits, add Y (v0), then either
+ * saturate each channel to bytes in v10..v13 (24/32 bpp) or pack the three
+ * channels into RGB565 halfwords in v25 (16 bpp).
+ */
+.macro do_yuv_to_rgb_stage2
+    rshrn           v20.4h, v20.4s, #15
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b   /* green += y */
+    uaddw           v24.8h, v24.8h, v0.8b   /* red   += y */
+    uaddw           v28.8h, v28.8h, v0.8b   /* blue  += y */
+  .if \bpp != 16
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    sri             v25.8h, v21.8h, #5
+    sri             v25.8h, v29.8h, #11
+  .endif
+.endm
+
+/*
+ * Steady-state loop body: stage 2 and the store for the current 8 pixels,
+ * interleaved with the loads and stage 1 for the next 8 pixels.  The
+ * instruction order is hand-scheduled; keep it as is.
+ */
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
+    ld1             {v0.8b}, [Y], 8
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8, \fast_st3
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+/* Unpipelined conversion, used for the 1..7 pixel tail */
+.macro do_yuv_to_rgb
+    do_yuv_to_rgb_stage1
+    do_yuv_to_rgb_stage2
+.endm
+
+.if \fast_st3 == 1
+asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    INPUT_ROW       .req w2
+    OUTPUT_BUF      .req x3
+    NUM_ROWS        .req w4
+
+    INPUT_BUF0      .req x5
+    INPUT_BUF1      .req x6
+    INPUT_BUF2      .req x1  /* reuses INPUT_BUF once the planes are loaded */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w15
+
+    /* Reserve 64 bytes for the low halves of v8-v15; x9 is the save cursor */
+    sub             sp, sp, 64
+    mov             x9, sp
+
+    /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
+
+    /* Save Neon registers */
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+    ld1             {v0.4h, v1.4h}, [x15], 16
+    ld1             {v2.8h}, [x15]
+
+    ldr             INPUT_BUF0, [INPUT_BUF]
+    ldr             INPUT_BUF1, [INPUT_BUF, #8]
+    ldr             INPUT_BUF2, [INPUT_BUF, #16]
+    .unreq          INPUT_BUF
+
+    /* Preset v10 and v13 to 0xFF.  For 32-bpp formats, whichever of the two
+     * is not overwritten by a colour channel supplies the fourth byte of
+     * every pixel written by st4.
+     */
+    movi            v10.16b, #255
+    movi            v13.16b, #255
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
+    add             INPUT_ROW, INPUT_ROW, #1
+    ldr             RGB, [OUTPUT_BUF], #8
+
+    /* Inner loop over pixels.  N is kept 8 (then 16) below the number of
+     * pixels left, so its low three bits always equal the tail length.
+     */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         8
+    do_yuv_to_rgb_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_yuv_to_rgb_stage2
+    do_store        \bpp, 8, \fast_st3
+    tst             N, #7
+    b.eq            8f
+    /* Tail: gather 4, 2 and 1 pixels into disjoint lanes.  Note that the
+     * numeric label 3 is defined twice; each "3f" binds to the next one.
+     */
+3:
+    tst             N, #4
+    b.eq            3f
+    do_load         4
+3:
+    tst             N, #2
+    b.eq            4f
+    do_load         2
+4:
+    tst             N, #1
+    b.eq            5f
+    do_load         1
+5:
+    do_yuv_to_rgb
+    tst             N, #4
+    b.eq            6f
+    do_store        \bpp, 4, \fast_st3
+6:
+    tst             N, #2
+    b.eq            7f
+    do_store        \bpp, 2, \fast_st3
+7:
+    tst             N, #1
+    b.eq            8f
+    do_store        \bpp, 1, \fast_st3
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    /* Use ret rather than br x30: it is the architectural return, so it is
+     * treated as one by return-address prediction and by branch target
+     * identification.
+     */
+    ret
+    .unreq          OUTPUT_WIDTH
+    .unreq          INPUT_ROW
+    .unreq          OUTPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          INPUT_BUF0
+    .unreq          INPUT_BUF1
+    .unreq          INPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_yuv_to_rgb
+.purgem do_yuv_to_rgb_stage1
+.purgem do_yuv_to_rgb_stage2
+.purgem do_yuv_to_rgb_stage2_store_load_stage1
+
+.endm
+
+/*
+ * Instantiate the YCbCr->RGB converters.  The R, G and B columns are the
+ * digit n of the register v1n that receives that channel, which fixes the
+ * byte order written by do_store.  For rgb565 (bpp 16) the generator does
+ * not use the offsets.
+ */
+/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
+
+/* fast_st3 = 0: the _slowst3 variants of the two 24-bpp converters, which
+ * store full groups with single-lane st1 instead of st3.
+ */
+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
+
+/* Drop the helper macros so the names can be redefined further down */
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+/*
+ * do_store size
+ *
+ * Write "size" converted samples to each output plane and advance the
+ * pointers: v20 -> Y, v21 -> U, v22 -> V.  A full group of 8 stores whole
+ * 8-byte vectors; partial sizes store single lanes (4 -> lanes 0-3,
+ * 2 -> lanes 4-5, 1 -> lane 6), one plane after another.
+ */
+.macro do_store size
+    .if \size == 8
+        st1             {v20.8b}, [Y], #8
+        st1             {v21.8b}, [U], #8
+        st1             {v22.8b}, [V], #8
+    .elseif \size == 4
+        .irp lane, 0, 1, 2, 3
+        st1             {v20.b}[\lane], [Y], #1
+        .endr
+        .irp lane, 0, 1, 2, 3
+        st1             {v21.b}[\lane], [U], #1
+        .endr
+        .irp lane, 0, 1, 2, 3
+        st1             {v22.b}[\lane], [V], #1
+        .endr
+    .elseif \size == 2
+        .irp lane, 4, 5
+        st1             {v20.b}[\lane], [Y], #1
+        .endr
+        .irp lane, 4, 5
+        st1             {v21.b}[\lane], [U], #1
+        .endr
+        .irp lane, 4, 5
+        st1             {v22.b}[\lane], [V], #1
+        .endr
+    .elseif \size == 1
+        st1             {v20.b}[6], [Y], #1
+        st1             {v21.b}[6], [U], #1
+        st1             {v22.b}[6], [V], #1
+    .else
+        .error unsupported macroblock size
+    .endif
+.endm
+
+/*
+ * do_load bpp, size, fast_ld3
+ *
+ * Read "size" packed pixels through the RGB pointer, de-interleaving the
+ * bytes into v10, v11, v12 (and v13 for 32 bpp), and advance the pointer.
+ * For a full 24-bpp group, fast_ld3 == 1 uses one ld3; otherwise the same
+ * 24 bytes are fetched with single-lane ld1 loads.  Full groups also
+ * prefetch 128 bytes ahead.  Partial sizes fill disjoint lanes
+ * (4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6).
+ */
+.macro do_load bpp, size, fast_ld3
+    .if \bpp == 24
+        .if \size == 8
+            .if \fast_ld3 == 1
+                ld3             {v10.8b, v11.8b, v12.8b}, [RGB], #24
+            .else
+                /* One pixel (three single-byte loads) per iteration */
+                .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
+                ld1             {v10.b}[\lane], [RGB], #1
+                ld1             {v11.b}[\lane], [RGB], #1
+                ld1             {v12.b}[\lane], [RGB], #1
+                .endr
+            .endif
+            prfm            pldl1keep, [RGB, #128]
+        .elseif \size == 4
+            .irp lane, 0, 1, 2, 3
+            ld3             {v10.b, v11.b, v12.b}[\lane], [RGB], #3
+            .endr
+        .elseif \size == 2
+            .irp lane, 4, 5
+            ld3             {v10.b, v11.b, v12.b}[\lane], [RGB], #3
+            .endr
+        .elseif \size == 1
+            ld3             {v10.b, v11.b, v12.b}[6], [RGB], #3
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 32
+        .if \size == 8
+            ld4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+            prfm            pldl1keep, [RGB, #128]
+        .elseif \size == 4
+            .irp lane, 0, 1, 2, 3
+            ld4             {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
+            .endr
+        .elseif \size == 2
+            .irp lane, 4, 5
+            ld4             {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
+            .endr
+        .elseif \size == 1
+            ld4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+        .else
+            .error unsupported macroblock size
+        .endif
+    .else
+        .error unsupported bpp
+    .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+ b_offs, fast_ld3
+
+/*
+ * Emits one RGB->YCbCr row converter named jsimd_<colorid>_ycc_convert_neon
+ * (with a _slowld3 suffix when fast_ld3 is 0).
+ *
+ *   bpp                      24 or 32; selects the load path in do_load
+ *   r_offs, g_offs, b_offs   digit n such that the channel is read from
+ *                            register v1n (v10..v13) after de-interleaving
+ *   fast_ld3                 forwarded to do_load for the 24-bpp full group
+ *
+ * Function arguments as bound below: w0 = width, x1 = input row pointer
+ * array, x2 = output plane array (three row-pointer arrays), w3 = first
+ * output row, w4 = number of rows.
+ *
+ * The conversion is a 2-stage pipeline: stage 1 accumulates the 32-bit
+ * weighted sums, stage 2 narrows them to bytes.
+ */
+
+/*
+ * Stage 1: widen R, G, B to 16 bits and accumulate
+ *   v14/v16 (Y) =  r*h[0] + g*h[1] + b*h[2]
+ *   v18/v26 (U) =  bias - r*h[3] - g*h[4] + b*h[5]
+ *   v28/v30 (V) =  bias + r*h[5] - g*h[6] - b*h[7]
+ * where h[] are the lanes of v0 and bias is rev64 of v1 (low/high four
+ * pixels respectively).
+ */
+.macro do_rgb_to_yuv_stage1
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s
+    rev64           v26.4s, v1.4s
+    rev64           v28.4s, v1.4s
+    rev64           v30.4s, v1.4s
+    umull           v14.4s, v4.4h, v0.h[0]
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]
+    umlsl2          v30.4s, v8.8h, v0.h[7]
+.endm
+
+/*
+ * Stage 2: drop the 16 fraction bits (rounding for Y, truncating for U and
+ * V) and narrow to bytes.
+ */
+.macro do_rgb_to_yuv_stage2
+    rshrn           v20.4h, v14.4s, #16
+    shrn            v22.4h, v18.4s, #16
+    shrn            v24.4h, v28.4s, #16
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h  /* v20 = y */
+    xtn             v21.8b, v22.8h  /* v21 = u */
+    xtn             v22.8b, v24.8h  /* v22 = v */
+.endm
+
+/* Unpipelined conversion, used for the 1..7 pixel tail */
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ * AArch64 processor actually can dual-issue LOAD/STORE with ALU */
+/* Steady-state loop body: finish the current 8 pixels, fetch the next 8,
+ * store the finished ones, then start stage 1 on the new data.
+ */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+    do_rgb_to_yuv_stage2
+    do_load         \bpp, 8, \fast_ld3
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+    do_rgb_to_yuv_stage1
+.endm
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants into v0 and v1 */
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers (low halves of v8-v15; x9 is the save cursor) */
+    sub             sp, sp, #64
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels.  N is kept 8 (then 16) below the number of
+     * pixels left, so its low three bits always equal the tail length.
+     */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8, \fast_ld3
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+    /* Tail: gather 4, 2 and 1 pixels into disjoint lanes.  Note that the
+     * numeric label 3 is defined twice; each "3f" binds to the next one.
+     */
+3:
+    tbz             N, #2, 3f
+    do_load         \bpp, 4, \fast_ld3
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2, \fast_ld3
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1, \fast_ld3
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    /* Use ret rather than br x30: it is the architectural return, so it is
+     * treated as one by return-address prediction and by branch target
+     * identification.
+     */
+    ret
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*
+ * Instantiate the RGB->YCbCr converters.  The R, G and B columns are the
+ * digit n of the register v1n that holds that channel after do_load has
+ * de-interleaved the pixels, i.e. the byte position of the channel within
+ * a pixel.
+ */
+/*--------------------------------- id ----- bpp R G B Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+/* Fast LD3 = 0: the _slowld3 variants of the two 24-bpp converters, which
+ * load full groups with single-lane ld1 instead of ld3.
+ */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
+
+/* Drop the helper macros so the names do not leak into later code */
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG's original jfdctint.c; see jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ * rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+ /*
+ * In: x0 (DATA) = address of an 8x8 block of 16-bit elements
+ * (128 bytes). The block is read, transformed, and written
+ * back in place.
+ * Out: nothing is returned; x0 is left pointing 64 bytes into the block.
+ * Scratch: x9 (TMP), x10, v0-v5, v16-v31.
+ * v8-v15 are also used, so their low 64 bits are spilled to a
+ * 64-byte stack area and reloaded before returning.
+ *
+ * The transform is two identical 1-D passes, each preceded by an 8x8
+ * transpose. The first pass descales by DESCALE_P1 and scales the
+ * outputs up by PASS1_BITS; the second pass descales by DESCALE_P2,
+ * which removes that extra scaling again.
+ */
+
+ DATA .req x0
+ TMP .req x9
+
+ /* Load constants */
+ get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
+ ld1 {v0.8h, v1.8h}, [TMP]
+
+ /* Save Neon registers */
+ sub sp, sp, #64
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
+
+ /* Load all DATA into Neon registers with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 | v16.8h
+ * 1 | d18 | d19 | v17.8h
+ * 2 | d20 | d21 | v18.8h
+ * 3 | d22 | d23 | v19.8h
+ * 4 | d24 | d25 | v20.8h
+ * 5 | d26 | d27 | v21.8h
+ * 6 | d28 | d29 | v22.8h
+ * 7 | d30 | d31 | v23.8h
+ */
+
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+ sub DATA, DATA, #64 /* rewind to the start of the block for the final store */
+
+ /* Transpose */
+ transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+ /* 1-D FDCT (first pass) */
+ add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
+ sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
+ add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
+ sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
+ add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
+ sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
+ add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
+ sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
+
+ /* even part */
+
+ add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
+ sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
+ add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
+ sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
+
+ add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
+ sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
+
+ add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
+
+ shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+ shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+ smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ mov v22.16b, v18.16b
+ mov v25.16b, v24.16b
+
+ smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+ smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+ rshrn v18.4h, v18.4s, #DESCALE_P1
+ rshrn v22.4h, v22.4s, #DESCALE_P1
+ rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+ /* Odd part */
+
+ add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
+ add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
+ add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
+ add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
+ smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
+ smull2 v5.4s, v10.8h, XFIX_P_1_175
+ smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+ smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+ smull2 v24.4s, v28.8h, XFIX_P_0_298
+ smull2 v25.4s, v29.8h, XFIX_P_2_053
+ smull2 v26.4s, v30.8h, XFIX_P_3_072
+ smull2 v27.4s, v31.8h, XFIX_P_1_501
+ smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+ smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+ smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+ smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+ smull2 v12.4s, v8.8h, XFIX_N_0_899
+ smull2 v13.4s, v9.8h, XFIX_N_2_562
+ smull2 v14.4s, v10.8h, XFIX_N_1_961
+ smull2 v15.4s, v11.8h, XFIX_N_0_390
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+ add v10.4s, v10.4s, v4.4s /* z3 += z5 */
+ add v14.4s, v14.4s, v5.4s
+ add v11.4s, v11.4s, v4.4s /* z4 += z5 */
+ add v15.4s, v15.4s, v5.4s
+
+ add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
+ add v24.4s, v24.4s, v12.4s
+ add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
+ add v25.4s, v25.4s, v13.4s
+ add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
+ add v26.4s, v26.4s, v14.4s
+ add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
+ add v27.4s, v27.4s, v15.4s
+
+ add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
+ add v24.4s, v24.4s, v14.4s
+ add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
+ add v25.4s, v25.4s, v15.4s
+ add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
+ add v26.4s, v26.4s, v13.4s
+ add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
+ add v27.4s, v27.4s, v12.4s
+
+ rshrn v23.4h, v28.4s, #DESCALE_P1
+ rshrn v21.4h, v29.4s, #DESCALE_P1
+ rshrn v19.4h, v30.4s, #DESCALE_P1
+ rshrn v17.4h, v31.4s, #DESCALE_P1
+ rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+ /* Transpose */
+ transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+ /* 1-D FDCT (second pass; descales by DESCALE_P2 = CONST_BITS + PASS1_BITS) */
+ add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
+ sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
+ add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
+ sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
+ add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
+ sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
+ add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
+ sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
+
+ /* even part */
+ add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
+ sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
+ add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
+ sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
+
+ add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
+ sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
+
+ add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
+
+ srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+ srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+ smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ mov v22.16b, v18.16b
+ mov v25.16b, v24.16b
+
+ smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+ smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+ rshrn v18.4h, v18.4s, #DESCALE_P2
+ rshrn v22.4h, v22.4s, #DESCALE_P2
+ rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
+
+ /* Odd part */
+ add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
+ add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
+ add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
+ add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
+
+ smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
+ smull2 v5.4s, v10.8h, XFIX_P_1_175
+ smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+ smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+ smull2 v24.4s, v28.8h, XFIX_P_0_298
+ smull2 v25.4s, v29.8h, XFIX_P_2_053
+ smull2 v26.4s, v30.8h, XFIX_P_3_072
+ smull2 v27.4s, v31.8h, XFIX_P_1_501
+ smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+ smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+ smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+ smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+ smull2 v12.4s, v8.8h, XFIX_N_0_899
+ smull2 v13.4s, v9.8h, XFIX_N_2_562
+ smull2 v14.4s, v10.8h, XFIX_N_1_961
+ smull2 v15.4s, v11.8h, XFIX_N_0_390
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+ add v10.4s, v10.4s, v4.4s /* z3 += z5 */
+ add v14.4s, v14.4s, v5.4s
+ add v11.4s, v11.4s, v4.4s /* z4 += z5 */
+ add v15.4s, v15.4s, v5.4s
+
+ add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
+ add v24.4s, v24.4s, v12.4s
+ add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
+ add v25.4s, v25.4s, v13.4s
+ add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
+ add v26.4s, v26.4s, v14.4s
+ add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
+ add v27.4s, v27.4s, v15.4s
+
+ add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
+ add v24.4s, v24.4s, v14.4s
+ add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
+ add v25.4s, v25.4s, v15.4s
+ add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
+ add v26.4s, v26.4s, v13.4s
+ add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
+ add v27.4s, v27.4s, v12.4s
+
+ rshrn v23.4h, v28.4s, #DESCALE_P2
+ rshrn v21.4h, v29.4s, #DESCALE_P2
+ rshrn v19.4h, v30.4s, #DESCALE_P2
+ rshrn v17.4h, v31.4s, #DESCALE_P2
+ rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
+
+ /* store results */
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+ /* Restore Neon registers */
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+ /* Return through x30 with RET rather than BR, so that the branch is
+ * treated as a subroutine return and not as a generic indirect branch.
+ */
+ ret
+
+ .unreq DATA
+ .unreq TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+ BUFFER .req x1 /* output byte pointer; points at the last byte written */
+ PUT_BUFFER .req x6 /* bit accumulator; new bits are shifted in at the bottom */
+ PUT_BITS .req x7 /* number of bits currently pending in PUT_BUFFER */
+ PUT_BITSw .req w7 /* 32-bit view of PUT_BITS */
+ /*
+ * Bit-buffer helper macros for the Huffman encoder below. They are
+ * expanded inline, use x19 as scratch, and change the condition flags.
+ * Their local labels (14, 31, 47) are numeric so that the macros can be
+ * expanded any number of times.
+ */
+.macro emit_byte
+ sub PUT_BITS, PUT_BITS, #0x8 /* eight fewer bits pending */
+ lsr x19, PUT_BUFFER, PUT_BITS /* oldest pending byte moves down to bits 7:0 */
+ uxtb w19, w19
+ strb w19, [BUFFER, #1]! /* pre-increment store of that byte */
+ cmp w19, #0xff
+ b.ne 14f
+ strb wzr, [BUFFER, #1]! /* a 0xff byte is followed by a stuffed 0x00 byte */
+14:
+.endm
+.macro put_bits CODE, SIZE
+ lsl PUT_BUFFER, PUT_BUFFER, \SIZE /* make room for SIZE new bits */
+ add PUT_BITS, PUT_BITS, \SIZE
+ orr PUT_BUFFER, PUT_BUFFER, \CODE /* assumes CODE has no bits set at or above bit SIZE - TODO confirm */
+.endm
+.macro checkbuf31
+ cmp PUT_BITS, #0x20
+ b.lt 31f /* fewer than 32 bits pending: nothing to flush */
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+31:
+.endm
+.macro checkbuf47
+ cmp PUT_BITS, #0x30
+ b.lt 47f /* fewer than 48 bits pending: nothing to flush */
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+ /*
+ * Arguments, following the C prototype in the comment that precedes
+ * these macros:
+ * x0 = state, x1 = buffer, x2 = block, w3 = last_dc_val,
+ * x4 = dctbl, x5 = actbl
+ * Returns the advanced output pointer in x0.
+ *
+ * fast_tbl == 1 reorders the 64 coefficients with TBL/TBX lookups;
+ * fast_tbl == 0 gathers them one 16-bit lane at a time.
+ *
+ * Stack frame (272 bytes):
+ * [sp, #0x00] saved x19/x20 (x19 is the emit_byte scratch register)
+ * [sp, #0x10] t1: 64 halfwords
+ * [sp, #0x90] t2: 64 halfwords
+ *
+ * The bit accumulator and its bit count are read from [x0, #0x10] and
+ * [x0, #0x18] and written back there before returning.
+ */
+ sub sp, sp, 272
+ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
+ /* Save Arm registers */
+ stp x19, x20, [sp]
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
+ ldr PUT_BUFFER, [x0, #0x10]
+ ldr PUT_BITSw, [x0, #0x18]
+ ldrsh w12, [x2] /* load DC coeff in w12 */
+ /* prepare data */
+.if \fast_tbl == 1
+ ld1 {v23.16b}, [x15], #16
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ /* ZigZag 8x8 */
+ tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+ tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+ tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+ tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+ tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+ tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+ tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+ tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+ ins v0.h[0], w12
+ tbx v1.16b, {v28.16b}, v16.16b
+ tbx v2.16b, {v29.16b, v30.16b}, v17.16b
+ tbx v5.16b, {v29.16b, v30.16b}, v18.16b
+ tbx v6.16b, {v31.16b}, v19.16b
+.else
+ /* Gather the coefficients lane by lane; address computations are
+ * interleaved with the loads. */
+ add x13, x2, #0x22
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ ld1 {v23.16b}, [x15]
+ add x14, x2, #0x18
+ add x3, x2, #0x36
+ ins v0.h[0], w12
+ add x9, x2, #0x2
+ ld1 {v1.h}[0], [x13]
+ add x15, x2, #0x30
+ ld1 {v2.h}[0], [x14]
+ add x19, x2, #0x26
+ ld1 {v3.h}[0], [x3]
+ add x20, x2, #0x28
+ ld1 {v0.h}[1], [x9]
+ add x12, x2, #0x10
+ ld1 {v1.h}[1], [x15]
+ add x13, x2, #0x40
+ ld1 {v2.h}[1], [x19]
+ add x14, x2, #0x34
+ ld1 {v3.h}[1], [x20]
+ add x3, x2, #0x1a
+ ld1 {v0.h}[2], [x12]
+ add x9, x2, #0x20
+ ld1 {v1.h}[2], [x13]
+ add x15, x2, #0x32
+ ld1 {v2.h}[2], [x14]
+ add x19, x2, #0x42
+ ld1 {v3.h}[2], [x3]
+ add x20, x2, #0xc
+ ld1 {v0.h}[3], [x9]
+ add x12, x2, #0x12
+ ld1 {v1.h}[3], [x15]
+ add x13, x2, #0x24
+ ld1 {v2.h}[3], [x19]
+ add x14, x2, #0x50
+ ld1 {v3.h}[3], [x20]
+ add x3, x2, #0xe
+ ld1 {v0.h}[4], [x12]
+ add x9, x2, #0x4
+ ld1 {v1.h}[4], [x13]
+ add x15, x2, #0x16
+ ld1 {v2.h}[4], [x14]
+ add x19, x2, #0x60
+ ld1 {v3.h}[4], [x3]
+ add x20, x2, #0x1c
+ ld1 {v0.h}[5], [x9]
+ add x12, x2, #0x6
+ ld1 {v1.h}[5], [x15]
+ add x13, x2, #0x8
+ ld1 {v2.h}[5], [x19]
+ add x14, x2, #0x52
+ ld1 {v3.h}[5], [x20]
+ add x3, x2, #0x2a
+ ld1 {v0.h}[6], [x12]
+ add x9, x2, #0x14
+ ld1 {v1.h}[6], [x13]
+ add x15, x2, #0xa
+ ld1 {v2.h}[6], [x14]
+ add x19, x2, #0x44
+ ld1 {v3.h}[6], [x3]
+ add x20, x2, #0x38
+ ld1 {v0.h}[7], [x9]
+ add x12, x2, #0x46
+ ld1 {v1.h}[7], [x15]
+ add x13, x2, #0x3a
+ ld1 {v2.h}[7], [x19]
+ add x14, x2, #0x74
+ ld1 {v3.h}[7], [x20]
+ add x3, x2, #0x6a
+ ld1 {v4.h}[0], [x12]
+ add x9, x2, #0x54
+ ld1 {v5.h}[0], [x13]
+ add x15, x2, #0x2c
+ ld1 {v6.h}[0], [x14]
+ add x19, x2, #0x76
+ ld1 {v7.h}[0], [x3]
+ add x20, x2, #0x78
+ ld1 {v4.h}[1], [x9]
+ add x12, x2, #0x62
+ ld1 {v5.h}[1], [x15]
+ add x13, x2, #0x1e
+ ld1 {v6.h}[1], [x19]
+ add x14, x2, #0x68
+ ld1 {v7.h}[1], [x20]
+ add x3, x2, #0x7a
+ ld1 {v4.h}[2], [x12]
+ add x9, x2, #0x70
+ ld1 {v5.h}[2], [x13]
+ add x15, x2, #0x2e
+ ld1 {v6.h}[2], [x14]
+ add x19, x2, #0x5a
+ ld1 {v7.h}[2], [x3]
+ add x20, x2, #0x6c
+ ld1 {v4.h}[3], [x9]
+ add x12, x2, #0x72
+ ld1 {v5.h}[3], [x15]
+ add x13, x2, #0x3c
+ ld1 {v6.h}[3], [x19]
+ add x14, x2, #0x4c
+ ld1 {v7.h}[3], [x20]
+ add x3, x2, #0x5e
+ ld1 {v4.h}[4], [x12]
+ add x9, x2, #0x64
+ ld1 {v5.h}[4], [x13]
+ add x15, x2, #0x4a
+ ld1 {v6.h}[4], [x14]
+ add x19, x2, #0x3e
+ ld1 {v7.h}[4], [x3]
+ add x20, x2, #0x6e
+ ld1 {v4.h}[5], [x9]
+ add x12, x2, #0x56
+ ld1 {v5.h}[5], [x15]
+ add x13, x2, #0x58
+ ld1 {v6.h}[5], [x19]
+ add x14, x2, #0x4e
+ ld1 {v7.h}[5], [x20]
+ add x3, x2, #0x7c
+ ld1 {v4.h}[6], [x12]
+ add x9, x2, #0x48
+ ld1 {v5.h}[6], [x13]
+ add x15, x2, #0x66
+ ld1 {v6.h}[6], [x14]
+ add x19, x2, #0x5c
+ ld1 {v7.h}[6], [x3]
+ add x20, x2, #0x7e
+ ld1 {v4.h}[7], [x9]
+ ld1 {v5.h}[7], [x15]
+ ld1 {v6.h}[7], [x19]
+ ld1 {v7.h}[7], [x20]
+.endif
+ /* v0-v7 now hold the 64 reordered values, with the DC difference in
+ * lane 0 of v0. Build, per lane: the sign mask (v24-v31), the absolute
+ * value (v0-v7), and abs ^ sign mask (v24-v31, i.e. the one's
+ * complement of the magnitude for negative inputs). */
+ cmlt v24.8h, v0.8h, #0
+ cmlt v25.8h, v1.8h, #0
+ cmlt v26.8h, v2.8h, #0
+ cmlt v27.8h, v3.8h, #0
+ cmlt v28.8h, v4.8h, #0
+ cmlt v29.8h, v5.8h, #0
+ cmlt v30.8h, v6.8h, #0
+ cmlt v31.8h, v7.8h, #0
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+ abs v3.8h, v3.8h
+ abs v4.8h, v4.8h
+ abs v5.8h, v5.8h
+ abs v6.8h, v6.8h
+ abs v7.8h, v7.8h
+ eor v24.16b, v24.16b, v0.16b
+ eor v25.16b, v25.16b, v1.16b
+ eor v26.16b, v26.16b, v2.16b
+ eor v27.16b, v27.16b, v3.16b
+ eor v28.16b, v28.16b, v4.16b
+ eor v29.16b, v29.16b, v5.16b
+ eor v30.16b, v30.16b, v6.16b
+ eor v31.16b, v31.16b, v7.16b
+ /* Per-lane "is zero" masks, narrowed to bytes, masked with v23 and
+ * pairwise-added down to a 64-bit map in x9. The vector work is
+ * interleaved with the scalar encoding of the DC value:
+ * w12 = 32 - clz(|DC diff|) and w13 = value bits trimmed to w12 bits.
+ * NOTE(review): v23 presumably holds one distinct bit per byte position
+ * so that the additions form a bitmap; confirm against the constant
+ * table. */
+ cmeq v16.8h, v0.8h, #0
+ cmeq v17.8h, v1.8h, #0
+ cmeq v18.8h, v2.8h, #0
+ cmeq v19.8h, v3.8h, #0
+ cmeq v20.8h, v4.8h, #0
+ cmeq v21.8h, v5.8h, #0
+ cmeq v22.8h, v6.8h, #0
+ xtn v16.8b, v16.8h
+ xtn v18.8b, v18.8h
+ xtn v20.8b, v20.8h
+ xtn v22.8b, v22.8h
+ umov w14, v0.h[0]
+ xtn2 v16.16b, v17.8h
+ umov w13, v24.h[0]
+ xtn2 v18.16b, v19.8h
+ clz w14, w14
+ xtn2 v20.16b, v21.8h
+ lsl w13, w13, w14
+ cmeq v17.8h, v7.8h, #0
+ sub w12, w14, #32
+ xtn2 v22.16b, v17.8h
+ lsr w13, w13, w14
+ and v16.16b, v16.16b, v23.16b
+ neg w12, w12
+ and v18.16b, v18.16b, v23.16b
+ add x3, x4, #0x400 /* x3 = dctbl->ehufsi */
+ and v20.16b, v20.16b, v23.16b
+ add x15, sp, #0x90 /* x15 = t2 */
+ and v22.16b, v22.16b, v23.16b
+ ldr w10, [x4, x12, lsl #2]
+ addp v16.16b, v16.16b, v18.16b
+ ldrb w11, [x3, x12]
+ addp v20.16b, v20.16b, v22.16b
+ checkbuf47
+ addp v16.16b, v16.16b, v20.16b
+ put_bits x10, x11
+ addp v16.16b, v16.16b, v18.16b
+ checkbuf47
+ umov x9, v16.D[0]
+ put_bits x13, x12
+ cnt v17.8b, v16.8b
+ mvn x9, x9
+ addv B18, v17.8b
+ add x4, x5, #0x400 /* x4 = actbl->ehufsi */
+ umov w12, v18.b[0]
+ lsr x9, x9, #0x1 /* clear AC coeff */
+ ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
+ rbit x9, x9 /* x9 = index0 */
+ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
+ /* w12 = number of bits set in the zero map. Fewer than 56 takes the
+ * path at 4:, which derives the bit sizes for all lanes with Neon;
+ * otherwise the sizes are derived per value inside the loop below. */
+ cmp w12, #(64-8)
+ add x11, sp, #16
+ b.lt 4f
+ cbz x9, 6f
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+ /* Here t1 holds absolute values and t2 holds abs ^ sign mask. */
+1:
+ clz x2, x9 /* x2 = run of zero values before the next non-zero one */
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w20, [x15, #-126] /* matching t1 entry (t1 is 128 bytes below t2) */
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14 /* run of 16 or more: emit the 0xf0 code */
+ b 2b
+3:
+ clz w20, w20
+ ldrh w3, [x15, #2]!
+ sub w11, w20, #32
+ lsl w3, w3, w20
+ neg w11, w11 /* w11 = 32 - clz(abs) */
+ lsr w3, w3, w20 /* w3 = value bits trimmed to w11 bits */
+ add x2, x11, x2, lsl #4 /* table index = (run << 4) + size */
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+ b 6f
+4:
+ /* Neon path: per lane, size = 16 - clz(abs), and the value bits are
+ * trimmed by shifting left then right by clz (USHL with a negated
+ * count shifts right). */
+ movi v21.8h, #0x0010
+ clz v0.8h, v0.8h
+ clz v1.8h, v1.8h
+ clz v2.8h, v2.8h
+ clz v3.8h, v3.8h
+ clz v4.8h, v4.8h
+ clz v5.8h, v5.8h
+ clz v6.8h, v6.8h
+ clz v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ neg v0.8h, v0.8h
+ neg v1.8h, v1.8h
+ neg v2.8h, v2.8h
+ neg v3.8h, v3.8h
+ neg v4.8h, v4.8h
+ neg v5.8h, v5.8h
+ neg v6.8h, v6.8h
+ neg v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ add v0.8h, v21.8h, v0.8h
+ add v1.8h, v21.8h, v1.8h
+ add v2.8h, v21.8h, v2.8h
+ add v3.8h, v21.8h, v3.8h
+ add v4.8h, v21.8h, v4.8h
+ add v5.8h, v21.8h, v5.8h
+ add v6.8h, v21.8h, v6.8h
+ add v7.8h, v21.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+ /* Here t1 holds the sizes and t2 holds the trimmed value bits. */
+1:
+ clz x2, x9 /* x2 = run of zero values before the next non-zero one */
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w11, [x15, #-126] /* size from t1 */
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14 /* run of 16 or more: emit the 0xf0 code */
+ b 2b
+3:
+ ldrh w3, [x15, #2]! /* value bits from t2 */
+ add x2, x11, x2, lsl #4 /* table index = (run << 4) + size */
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+6:
+ /* sp + 0x10e is the address of the last t2 entry. If x15 stopped short
+ * of it, the block ended with zero values: emit table entry 0. */
+ add x13, sp, #0x10e
+ cmp x15, x13
+ b.hs 1f
+ ldr w12, [x5]
+ ldrb w14, [x4]
+ checkbuf47
+ put_bits x12, x14
+1:
+ str PUT_BUFFER, [x0, #0x10]
+ str PUT_BITSw, [x0, #0x18]
+ ldp x19, x20, [sp], 16
+ add x0, BUFFER, #0x1
+ add sp, sp, 256
+ /* Return through x30 with RET rather than BR, so that the branch is
+ * treated as a subroutine return and not as a generic indirect branch.
+ */
+ ret
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1 /* defines jsimd_huff_encode_one_block_neon */
+generate_jsimd_huff_encode_one_block 0 /* defines jsimd_huff_encode_one_block_neon_slowtbl */
+ /* Both variants are generated; release the register aliases and helper macros used by them. */
+ .unreq BUFFER
+ .unreq PUT_BUFFER
+ .unreq PUT_BITS
+ .unreq PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables:
+ * ALIGN(n) expands to the compiler-specific specifier that requests n-byte
+ * alignment and is written in front of the declaration it applies to.
+ * Compilers other than MSVC, Clang and GCC are rejected at compile time.
+ */
+#if defined(_MSC_VER)
+#define ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment) __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants
+ *
+ * Each F_x_yyy value is the named fraction scaled by 2^16 and rounded to an
+ * integer, e.g. 19595 / 65536 ~= 0.29900 and 32768 / 65536 = 0.5.
+ */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
+
+/* The same eight constants as a 16-byte-aligned table of uint16_t, in the
+ * order listed above. NOTE(review): presumably read by the included
+ * jccolext-neon.c routines; confirm there.
+ */
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+ F_0_298, F_0_587, F_0_113, F_0_168,
+ F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants
+ *
+ * Each value is the luminance weight scaled by 2^16 and rounded to an
+ * integer: 19595 / 65536 ~= 0.29900 (R), 38470 / 65536 ~= 0.58700 (G),
+ * 7471 / 65536 ~= 0.11400 (B).
+ */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* Converts num_rows rows of input_buf into Y rows of output_buf[0], starting at row output_row. */
+ JSAMPROW inptr; /* read position in the current input row */
+ JSAMPROW outptr; /* write position in the current output row */
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ while (--num_rows >= 0) { /* one iteration per image row */
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row]; /* only plane 0 of output_buf is written */
+ output_row++;
+
+ int cols_remaining = image_width; /* pixels of this row still to convert */
+ for (; cols_remaining > 0; cols_remaining -= 16) { /* 16 pixels per pass */
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 16) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf; /* this is the row's final pass, so inptr need not stay in the source row */
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr); /* de-interleave 16 four-byte pixels */
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr); /* de-interleave 16 three-byte pixels */
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); /* widen to 16 bits: _l = pixels 0-7, _h = pixels 8-15 */
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298); /* four 32-bit accumulators of 4 pixels each, started with the R term */
+ uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+ uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+ uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587); /* accumulate the G term */
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113); /* accumulate the B term */
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
+ * permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); /* always stores 16 samples, even on a partial final pass */
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr += 16;
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE 64 /* bit buffer capacity in bits; FLUSH() writes 8 bytes on this target */
+#else
+#define BIT_BUF_SIZE 32 /* bit buffer capacity in bits; FLUSH() writes 4 bytes on this target */
+#endif
+
+typedef struct { /* fields that change within an MCU */
+ size_t put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits still unused in put_buffer (PUT_BITS() subtracts from it) */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct { /* output position plus the in-progress savable_state */
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd; /* NOTE(review): presumably nonzero when the SIMD encoder path is used -- confirm against the code that sets it */
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { /* writes through a pointer named buffer taken from the expansion scope */ \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; /* speculative stuffing byte; kept only when b is 0xFF */ \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); /* advance by 2 when b == 0xFF, otherwise by 1 */ \
+}
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { /* nonzero exactly when put_buffer contains a 0xFF byte */ \
+ EMIT_BYTE(put_buffer >> 56) /* slow path: most significant byte first, stuffing as needed */ \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); /* fast path: one 8-byte store; byte swap presumably puts the most significant byte first on a little-endian target */ \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); /* most significant byte first */ \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer ); \
+ buffer += 4; \
+}
+#else
+#define SPLAT() { \
+ put_buffer = __builtin_bswap32(put_buffer); /* note: overwrites put_buffer with the swapped value */ \
+ __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); /* post-indexed store: writes 4 bytes, then advances buffer by 4 */ \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { /* nonzero exactly when put_buffer contains a 0xFF byte */ \
+ EMIT_BYTE(put_buffer >> 24) /* slow path: most significant byte first, stuffing as needed */ \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT(); /* fast path: write all 4 bytes without stuffing */ \
+ } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { /* PUT_BITS() invokes this with free_bits already reduced by size, i.e. negative */ \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); /* size + free_bits = bits that still fit; -free_bits = bits of code left over */ \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; /* capacity of the emptied buffer minus the leftover bits */ \
+ put_buffer = code; /* NOTE(review): the already-flushed high bits of code stay in put_buffer; presumably they are shifted out before the next flush -- confirm */ \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { /* uses put_buffer and free_bits from the expansion scope */ \
+ free_bits -= size; /* a negative result means code does not fit entirely */ \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) /* expands to a braced block, so the else below still pairs with this if */ \
+ else \
+ put_buffer = (put_buffer << size) | code; /* append code below the bits already buffered */ \
+}
+
+#define PUT_CODE(code, size, diff) { /* modifies diff and the nbits variable of the expansion scope */ \
+ diff |= code << nbits; /* place code above the low nbits bits of diff */ \
+ nbits += size; /* combined length of code plus the bits already in diff */ \
+ PUT_BITS(diff, nbits) /* emit both with a single buffer insertion */ \
+}
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..51db3c5f39
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,623 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *values, size_t *zerobits)
+{ /* Gathers Sl coefficients of block through jpeg_natural_order_start, writes their shifted magnitudes and diff values to values, and a nonzero bitmap to zerobits. */
+ UJCOEF *values_ptr = values; /* magnitudes occupy the first DCTSIZE2 entries of values */
+ UJCOEF *diff_values_ptr = values + DCTSIZE2; /* diff values follow the magnitudes */
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) { /* full groups of 16 coefficients (two vectors) */
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); /* fills every lane; lanes 1-7 are replaced below */
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); /* 0xFFFF in negative lanes, 0 elsewhere */
+ uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); /* negative shift count: logical right shift by Al */
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); /* bitwise complement of the magnitude in negative lanes */
+ uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_u16(values_ptr, abs_coefs1);
+ vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+ vst1q_u16(diff_values_ptr, diff1);
+ vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) { /* 9-15 left: one full vector plus a partial one */
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0); /* lanes that are not loaded below stay zero */
+ switch (remaining_coefs) { /* falls through so that lanes 0 .. remaining_coefs - 9 are loaded */
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+ uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+ uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_u16(values_ptr, abs_coefs1);
+ vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+ vst1q_u16(diff_values_ptr, diff1);
+ vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) { /* 1-8 left: a single partial vector */
+ int16x8_t coefs = vdupq_n_s16(0); /* lanes that are not loaded below stay zero */
+
+ switch (remaining_coefs) { /* falls through so that lanes 0 .. remaining_coefs - 1 are loaded */
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+ abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_u16(values_ptr, abs_coefs);
+ vst1q_u16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) { /* one 8-entry row per iteration */
+ vst1q_u16(values_ptr, vdupq_n_u16(0));
+ vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE); /* reload the magnitudes stored above */
+ uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+ uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+ uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+ uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+ uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+ uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+ uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0))); /* 0xFF in lanes whose value is zero */
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask); /* lane k keeps only bit k */
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0); /* three pairwise-add rounds fold each row into one byte */
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); /* byte r now holds the eight flags of row r */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap; /* inverted: bitmap flags zeros, zerobits flags nonzeros */
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); /* rows 0-3 */
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); /* rows 4-7 */
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0; /* inverted: bitmap flags zeros, zerobits flags nonzeros */
+ zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *absvalues, size_t *bits)
+{ /* Gathers Sl coefficients of block, writes shifted magnitudes to absvalues and the zerobits/signbits bitmaps to bits, and returns the EOB position. */
+ /* Temporary storage buffers for data used to compute the signbits bitmap and
+ * the end-of-block (EOB) position
+ */
+ uint8_t coef_sign_bits[64]; /* one byte per coefficient: 0xFF if negative, else 0 */
+ uint8_t coef_eq1_bits[64]; /* one byte per coefficient: 0xFF if the shifted magnitude is 1, else 0 */
+
+ UJCOEF *absvalues_ptr = absvalues;
+ uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+ uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) { /* full groups of 16 coefficients (two vectors) */
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]); /* fills every lane; lanes 1-7 are replaced below */
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15))); /* 0xFF in negative lanes, 0 elsewhere */
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); /* negative shift count: logical right shift by Al */
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs1);
+ vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) { /* 9-15 left: one full vector plus a partial one */
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0); /* lanes that are not loaded below stay zero */
+ switch (remaining_coefs) { /* falls through so that lanes 0 .. remaining_coefs - 9 are loaded */
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs1);
+ vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) { /* 1-8 left: a single partial vector */
+ int16x8_t coefs = vdupq_n_s16(0); /* lanes that are not loaded below stay zero */
+
+ switch (remaining_coefs) { /* falls through so that lanes 0 .. remaining_coefs - 1 are loaded */
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+ abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in blocks. */
+ for (i = 0; i < rows_to_zero; i++) { /* one 8-entry row per iteration */
+ vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
+ vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+ vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. */
+ uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE); /* reload the magnitudes stored above */
+ uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+ uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+ uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+ uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+ uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+ uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+ uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0))); /* 0xFF in lanes whose value is zero */
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask); /* lane k keeps only bit k */
+ abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+ abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+ abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+ abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+ abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+ abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+ abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0); /* three pairwise-add rounds fold each row into one byte */
+ uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567); /* byte r now holds the eight flags of row r */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap; /* inverted: a set bit marks a nonzero magnitude */
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0); /* rows 0-3 */
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1); /* rows 4-7 */
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap0; /* inverted: a set bit marks a nonzero magnitude */
+ bits[1] = ~bitmap1;
+#endif
+
+ /* Construct signbits bitmap. */
+ uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+ uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+ uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+ uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+ uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+ uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+ uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+ uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+ signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+ signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+ signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+ signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+ signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+ signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+ signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+ signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1); /* same pairwise-add folding as for zerobits */
+ bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+ bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+ bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store signbits bitmap. */
+ bits[1] = ~bitmap; /* inverted: a set bit marks a coefficient that is not negative */
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store signbits bitmap. */
+ bits[2] = ~bitmap0; /* inverted: a set bit marks a coefficient that is not negative */
+ bits[3] = ~bitmap1;
+#endif
+
+ /* Construct bitmap to find EOB position (the index of the last coefficient
+ * equal to 1.)
+ */
+ uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+ uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+ uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+ uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+ uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+ uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+ uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+ uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+ row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+ row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+ row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+ row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+ row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+ row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+ row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+ row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1); /* same pairwise-add folding as for zerobits */
+ bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+ bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+ bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0); /* not inverted here: a set bit marks a magnitude equal to 1 */
+
+ /* Return EOB position. */
+ if (bitmap == 0) {
+ /* EOB position is defined to be 0 if all coefficients != 1. */
+ return 0;
+ } else {
+ return 63 - BUILTIN_CLZLL(bitmap); /* index of the highest set bit */
+ }
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+ /* Return EOB position. */
+ if (bitmap0 == 0 && bitmap1 == 0) {
+ return 0; /* no magnitude equals 1 */
+ } else if (bitmap1 != 0) {
+ return 63 - BUILTIN_CLZ(bitmap1); /* highest set bit is in the upper word (positions 32-63) */
+ } else {
+ return 31 - BUILTIN_CLZ(bitmap0); /* highest set bit is in the lower word (positions 0-31) */
+ }
+#endif
+}
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0: identity */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1: lane 15 <- 14 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2: lanes 14-15 <- 13 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3: lanes 13-15 <- 12 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4: lanes 12-15 <- 11 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5: lanes 11-15 <- 10 */
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6: lanes 10-15 <- 9 */
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7: lanes 9-15 <- 8 */
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8: lanes 8-15 <- 7 */
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9: lanes 7-15 <- 6 */
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10: lanes 6-15 <- 5 */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11: lanes 5-15 <- 4 */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12: lanes 4-15 <- 3 */
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13: lanes 3-15 <- 2 */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14: lanes 2-15 <- 1 */
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15: lanes 1-15 <- 0 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ *
+ * Each output sample is the average of two horizontally adjacent input
+ * samples. A bias that alternates 0, 1 across output samples is added before
+ * the divide by 2, so the rounding direction alternates from sample to sample.
+ *
+ * image_width: width of the input rows in samples; samples of the last
+ * block at or beyond this index are treated as padding
+ * max_v_samp_factor: not referenced by this routine
+ * v_samp_factor: number of rows to process
+ * width_in_blocks: number of DCTSIZE-sample blocks written per output row;
+ * 2 * DCTSIZE input samples are read for each of them
+ * input_data: input rows, indexed by row number
+ * output_data: output rows, indexed by row number
+ *
+ * NOTE(review): this assumes width_in_blocks >= 1 and
+ * 0 <= width_in_blocks * 2 * DCTSIZE - image_width <= 15, because
+ * jsimd_h2_downsample_consts has only 16 rows -- confirm against the caller.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. The
+ * number of padding samples selects one 16-byte row of the table.
+ */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. A full 16 bytes are loaded
+ * even when some of them lie beyond image_width; those lanes are replaced
+ * by the table lookup below before they are used.
+ */
+ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+ uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+ pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+ vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ *
+ * Each output sample is the average of a 2x2 square of input samples.  A bias
+ * that alternates 1, 2 across output samples is added before the divide by 4,
+ * so the rounding direction alternates from sample to sample.
+ *
+ * image_width:       width of the input rows in samples; samples of the last
+ *                    block at or beyond this index are treated as padding
+ * max_v_samp_factor: not referenced by this routine
+ * v_samp_factor:     number of output rows to produce; 2 * v_samp_factor
+ *                    input rows are consumed
+ * width_in_blocks:   number of DCTSIZE-sample blocks written per output row;
+ *                    2 * DCTSIZE input samples per input row are read for
+ *                    each of them
+ * input_data:        input rows, indexed by row number
+ * output_data:       output rows, indexed by row number
+ *
+ * NOTE(review): this assumes width_in_blocks >= 1 and
+ * 0 <= width_in_blocks * 2 * DCTSIZE - image_width <= 15, because
+ * jsimd_h2_downsample_consts has only 16 rows -- confirm against the caller.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block.  The
+   * number of padding samples selects one 16-byte row of the table.
+   */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    /* Every output row consumes its own pair of input rows.  Indexing the
+     * input with outrow and outrow + 1 is only correct for outrow == 0; for
+     * later rows it would reuse the previous pair's second row and never read
+     * the last input row.
+     */
+    inptr0 = input_data[2 * outrow];
+    inptr1 = input_data[2 * outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table.  A full 16 bytes are loaded
+     * from each row even when some of them lie beyond image_width; those
+     * lanes are replaced by the table lookup below before they are used.
+     */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ *
+ * jsimd_ycc_rgb_convert_neon() converts num_rows rows of output_width pixels.
+ * For each row it reads Y, Cb, and Cr samples from input_buf[0][input_row],
+ * input_buf[1][input_row], and input_buf[2][input_row] (input_row is
+ * incremented after each row) and writes packed pixels through the next row
+ * pointer taken from output_buf. Pixels are processed 16 at a time, then 8
+ * at a time; a final tail of 1-7 pixels is converted as a full 8-lane vector
+ * but stored one pixel at a time, so nothing is written beyond output_width.
+ *
+ * The output layout is selected at compile time by the including file:
+ * RGB_PIXELSIZE == 4 stores four interleaved bytes per pixel (RGB_RED,
+ * RGB_GREEN, RGB_BLUE, and RGB_ALPHA give the byte positions, and the
+ * RGB_ALPHA byte is set to 0xFF), RGB_PIXELSIZE == 3 stores three interleaved
+ * bytes per pixel, and any other value stores one 16-bit RGB565 value per
+ * pixel.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ uint8x16_t y = vld1q_u8(inptr0);
+ uint8x16_t cb = vld1q_u8(inptr1);
+ uint8x16_t cr = vld1q_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. Each unsigned byte is widened and added
+ * to -128 held as a 16-bit pattern; the sum, reinterpreted as signed, is
+ * the centered chroma value.
+ */
+ int16x8_t cr_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cr)));
+ int16x8_t cr_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cr)));
+ int16x8_t cb_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cb)));
+ int16x8_t cb_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cb)));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+ int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+ consts, 0);
+ int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+ int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+ consts, 0);
+ g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+ consts, 1);
+ g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+ consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+ vrshrn_n_s32(g_sub_y_lh, 15));
+ int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+ vrshrn_n_s32(g_sub_y_hh, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128). The constant is scaled by 2^14
+ * while the rounding doubling multiply-high descales by 2^15, so the
+ * chroma operand is doubled first.
+ */
+ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+ consts, 2);
+ int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+ consts, 3);
+ int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t r_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t b_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t b_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t g_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t g_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+ vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x16x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. The saturating left shift by
+ * 8 clamps each component and moves it to the upper byte; the two
+ * shift-right-and-insert steps then leave the top 5 bits of R in bits
+ * 15-11, the top 6 bits of G in bits 10-5, and the top 5 bits of B in
+ * bits 4-0.
+ */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 16;
+ inptr2 += 16;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining >= 8) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ vst3_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 8;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 8);
+ cols_remaining -= 8;
+ }
+
+ /* Handle the tail elements (1-7 pixels). A full 8-sample vector is loaded
+ * from each input row and converted, but only cols_remaining pixels are
+ * stored, one lane at a time, by the fall-through switch statements below.
+ */
+ if (cols_remaining > 0) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB565 pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..28dbc57243
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants
+ *
+ * Fixed-point multipliers: F_0_344 and F_0_714 are scaled by 2^15, F_1_402
+ * and F_1_772 by 2^14. The table below is loaded as one 4-lane vector by the
+ * included jdcolext-neon.c, which selects multipliers by lane index
+ * (0: -0.344, 1: 0.714, 2: 1.402, 3: 1.772), so the order of the entries must
+ * not change. Note that the first entry is stored negated.
+ */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions.
+ *
+ * jdcolext-neon.c is used as a template: every inclusion below emits one
+ * definition of jsimd_ycc_rgb_convert_neon(), specialized by whichever
+ * RGB_RED / RGB_GREEN / RGB_BLUE / RGB_ALPHA / RGB_PIXELSIZE macros are in
+ * effect at that point. The first inclusion uses the macro values already
+ * in scope (presumably the library defaults from the headers included above
+ * -- TODO confirm). Each later inclusion first #defines
+ * jsimd_ycc_rgb_convert_neon to a format-specific name so the definitions do
+ * not collide, and #undefs everything afterwards so the next section starts
+ * clean. RGB_ALPHA is defined only for the 4-byte formats.
+ */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion
+ *
+ * Only RGB_PIXELSIZE is defined here. A value of 2 matches neither the
+ * 4-byte nor the 3-byte branch in jdcolext-neon.c, so the RGB565 packing
+ * branch is compiled and the channel-position macros are not needed.
+ */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..18fb9d8a55
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,144 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants
+ *
+ * Fixed-point multipliers: F_0_344 and F_0_714 are scaled by 2^15, F_1_402
+ * and F_1_772 by 2^14. Note that the first table entry is stored negated.
+ * NOTE(review): the table is presumably loaded as a 4-lane vector and
+ * indexed by lane in the included jdmrgext-neon.c, in which case the order
+ * of the entries must not change -- confirm against that file.
+ */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions.
+ *
+ * jdmrgext-neon.c is used as a template: it is included once per pixel
+ * format, specialized by whichever RGB_RED / RGB_GREEN / RGB_BLUE /
+ * RGB_ALPHA / RGB_PIXELSIZE macros are in effect at that point. The first
+ * inclusion uses the macro values already in scope (presumably the library
+ * defaults from the headers included above -- TODO confirm). Each later
+ * section first #defines jsimd_h2v1_merged_upsample_neon and
+ * jsimd_h2v2_merged_upsample_neon to format-specific names (presumably the
+ * two functions the template defines, so that the definitions do not
+ * collide), and #undefs everything afterwards so the next section starts
+ * clean. RGB_ALPHA is defined only for the 4-byte formats.
+ */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+/* XRGB instantiation of the jdmrgext-neon.c template: RGB_ALPHA is 0 for this
+ * 4-byte format, and the two merged-upsample entry points are renamed to
+ * their extxrgb names for the duration of the inclusion.  Both rename macros
+ * are #undef'd afterwards so that nothing leaks past this section.
+ */
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ * One Y row and one Cb/Cr row in; output_width pixels out to output_buf[0]. */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data (rows of input_buf[0], [1], and [2]) */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); /* the four scaled constants listed at the top of this file */
+ const int16x8_t neg_128 = vdupq_n_s16(-128); /* bias added to widened Cb/Cr below */
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ int cols_remaining = output_width; /* output pixels still to be produced */
+ for (; cols_remaining >= 16; cols_remaining -= 16) { /* main loop: 16 output pixels per iteration */
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr (widening add of the 8-bit samples to -128). */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128). The constant is scaled by 2^14 but vqrdmulh descales by 2^15, so the input is doubled first. */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128), with the same input doubling. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba;
+ rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory (16 interleaved 4-byte pixels). */
+ vst4q_u8(outptr, rgba);
+#else
+ uint8x16x3_t rgb;
+ rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Store RGB pixel data to memory (16 interleaved 3-byte pixels). */
+ vst3q_u8(outptr, rgb);
+#endif
+
+ /* Increment pointers: 16 Y samples, 8 Cb, 8 Cr, and 16 output pixels. */
+ inptr0 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) { /* 1 to 15 pixels are left after the main loop */
+ /* Same computation as the main loop, applied to the final partial block.
+ * The loads below still fetch a full block (see the notes on safe memory
+ * access above); only the stores further down are limited to
+ * cols_remaining pixels. */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr (widening add of the 8-bit samples to -128). */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128), input doubled as in the main loop. */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128), input doubled as in the main loop. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba_h; /* pixels 8-15 of the block */
+ rgba_h.val[RGB_RED] = r.val[1];
+ rgba_h.val[RGB_GREEN] = g.val[1];
+ rgba_h.val[RGB_BLUE] = b.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ uint8x8x4_t rgba_l; /* pixels 0-7 of the block */
+ rgba_l.val[RGB_RED] = r.val[0];
+ rgba_l.val[RGB_GREEN] = g.val[0];
+ rgba_l.val[RGB_BLUE] = b.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store exactly cols_remaining RGBA pixels: cases 9-15 store pixels 8 and up one lane at a time and fall through to the whole-vector store of pixels 0-7; cases 1-7 store single lanes only. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr, rgba_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb_h; /* pixels 8-15 of the block */
+ rgb_h.val[RGB_RED] = r.val[1];
+ rgb_h.val[RGB_GREEN] = g.val[1];
+ rgb_h.val[RGB_BLUE] = b.val[1];
+ uint8x8x3_t rgb_l; /* pixels 0-7 of the block */
+ rgb_l.val[RGB_RED] = r.val[0];
+ rgb_l.val[RGB_GREEN] = g.val[0];
+ rgb_l.val[RGB_BLUE] = b.val[0];
+ /* Store exactly cols_remaining RGB pixels, using the same fall-through scheme as the RGBA branch above. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr, rgb_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ * Two Y rows share one Cb/Cr row; output_width pixels are written to each of output_buf[0] and output_buf[1].
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr0, outptr1;
+ /* Pointers to Y (both rows), Cb, and Cr data */
+ JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts); /* the four scaled constants listed at the top of this file */
+ const int16x8_t neg_128 = vdupq_n_s16(-128); /* bias added to widened Cb/Cr below */
+
+ inptr0_0 = input_buf[0][in_row_group_ctr * 2]; /* each row group holds two Y rows ... */
+ inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+ inptr1 = input_buf[1][in_row_group_ctr]; /* ... and one row each of Cb and Cr */
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ int cols_remaining = output_width; /* output pixels still to be produced per row */
+ for (; cols_remaining >= 16; cols_remaining -= 16) { /* main loop: 16 output pixels per row per iteration */
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr (widening add of the 8-bit samples to -128). */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128). The constant is scaled by 2^14 but vqrdmulh descales by 2^15, so the input is doubled first. */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128), with the same input doubling. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba0, rgba1;
+ rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory (16 interleaved 4-byte pixels per row). */
+ vst4q_u8(outptr0, rgba0);
+ vst4q_u8(outptr1, rgba1);
+#else
+ uint8x16x3_t rgb0, rgb1;
+ rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Store RGB pixel data to memory (16 interleaved 3-byte pixels per row). */
+ vst3q_u8(outptr0, rgb0);
+ vst3q_u8(outptr1, rgb1);
+#endif
+
+ /* Increment pointers: 16 Y samples per row, 8 Cb, 8 Cr, and 16 output pixels per row. */
+ inptr0_0 += 16;
+ inptr0_1 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr0 += (RGB_PIXELSIZE * 16);
+ outptr1 += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) { /* 1 to 15 pixels per row are left after the main loop */
+ /* Same computation as the main loop, applied to the final partial block.
+ * The loads below still fetch a full block (see the notes on safe memory
+ * access above); only the stores further down are limited to
+ * cols_remaining pixels per row. */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr (widening add of the 8-bit samples to -128). */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128), input doubled as in the main loop. */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128), input doubled as in the main loop. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba0_h, rgba1_h; /* pixels 8-15 of the block, rows 0 and 1 */
+ rgba0_h.val[RGB_RED] = r0.val[1];
+ rgba1_h.val[RGB_RED] = r1.val[1];
+ rgba0_h.val[RGB_GREEN] = g0.val[1];
+ rgba1_h.val[RGB_GREEN] = g1.val[1];
+ rgba0_h.val[RGB_BLUE] = b0.val[1];
+ rgba1_h.val[RGB_BLUE] = b1.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+ uint8x8x4_t rgba0_l, rgba1_l; /* pixels 0-7 of the block, rows 0 and 1 */
+ rgba0_l.val[RGB_RED] = r0.val[0];
+ rgba1_l.val[RGB_RED] = r1.val[0];
+ rgba0_l.val[RGB_GREEN] = g0.val[0];
+ rgba1_l.val[RGB_GREEN] = g1.val[0];
+ rgba0_l.val[RGB_BLUE] = b0.val[0];
+ rgba1_l.val[RGB_BLUE] = b1.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store exactly cols_remaining RGBA pixels per row: cases 9-15 store pixels 8 and up one lane at a time and fall through to the whole-vector store of pixels 0-7; cases 1-7 store single lanes only. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+ vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+ vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+ vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+ vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+ vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+ vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+ vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr0, rgba0_l);
+ vst4_u8(outptr1, rgba1_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+ vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+ vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+ vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+ vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+ vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+ vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr0, rgba0_l, 0);
+ vst4_lane_u8(outptr1, rgba1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb0_h, rgb1_h; /* pixels 8-15 of the block, rows 0 and 1 */
+ rgb0_h.val[RGB_RED] = r0.val[1];
+ rgb1_h.val[RGB_RED] = r1.val[1];
+ rgb0_h.val[RGB_GREEN] = g0.val[1];
+ rgb1_h.val[RGB_GREEN] = g1.val[1];
+ rgb0_h.val[RGB_BLUE] = b0.val[1];
+ rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+ uint8x8x3_t rgb0_l, rgb1_l; /* pixels 0-7 of the block, rows 0 and 1 */
+ rgb0_l.val[RGB_RED] = r0.val[0];
+ rgb1_l.val[RGB_RED] = r1.val[0];
+ rgb0_l.val[RGB_GREEN] = g0.val[0];
+ rgb1_l.val[RGB_GREEN] = g1.val[0];
+ rgb0_l.val[RGB_BLUE] = b0.val[0];
+ rgb1_l.val[RGB_BLUE] = b1.val[0];
+ /* Store exactly cols_remaining RGB pixels per row, using the same fall-through scheme as the RGBA branch above. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+ vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+ vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+ vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+ vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+ vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+ vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+ vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr0, rgb0_l);
+ vst3_u8(outptr1, rgb1_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+ vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+ vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+ vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+ vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+ vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+ vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr0, rgb0_l, 0);
+ vst3_lane_u8(outptr1, rgb1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ * p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ * p0(upsampled) = s0
+ * p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr; /* rows that receive the result */
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr; /* input column at which the next vectors are loaded */
+ /* Set up constants: the bias of 1 and the blend weight of 3 used below. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow]; /* output row index equals input row index */
+ /* First pixel component value in this row of the original image */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* 3/4 * containing sample + 1/4 * nearest neighboring sample
+ * For p1: containing sample = s0, nearest neighboring sample = s1
+ * For p2: containing sample = s1, nearest neighboring sample = s0
+ */
+ uint8x16_t s0 = vld1q_u8(inptr); /* 16 samples starting at column 0 */
+ uint8x16_t s1 = vld1q_u8(inptr + 1); /* 16 samples starting at column 1 */
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* The offset is initially 1, because the first pixel component has already
+ * been stored. However, in subsequent iterations of the SIMD loop, this
+ * offset is (2 * colctr - 1) to stay within the bounds of the sample
+ * buffers without having to resort to a slow scalar tail case for the last
+ * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
+ * in jmemmgr.c for more details.
+ */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+ /* We use software pipelining to maximise performance. The code indented
+ * an extra two spaces begins the next iteration of the loop.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Store the previous group's pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1; /* where the group just loaded is stored */
+ }
+
+ /* Complete the last iteration of the loop. */
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Last pixel component value in this row of the original image */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * sA | | | |
+ * | p6 p7 | p8 p9 | p10 p11|
+ * +---------+---------+---------+
+ * | p12 p13| p14 p15| p16 p17|
+ * sB | | | |
+ * | p18 p19| p20 p21| p22 p23|
+ * +---------+---------+---------+
+ * | p24 p25| p26 p27| p28 p29|
+ * sC | | | |
+ * | p30 p31| p32 p33| p34 p35|
+ * +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above. To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1. For example:
+ * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ * 1/4 * (3/4 * s0B + 1/4 * s0A)
+ * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ * p0(upsampled) = s0A
+ * p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr; /* rows that receive the result */
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr; /* input column at which the next vectors are loaded */
+ /* Set up constants: blend weights of 3 and the bias added to p1 values. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3); /* weight used in step 1 */
+ const uint16x8_t three_u16 = vdupq_n_u16(3); /* weight used in step 2 */
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1]; /* row above; index is -1 when inrow is 0 */
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1]; /* row below */
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* First pixel component value in this row of the original image */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+ /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both
+ * dimensions at once, right-shifting by 4.
+ */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0A = vld1q_u8(inptr0);
+ uint8x16_t s0B = vld1q_u8(inptr1);
+ uint8x16_t s0C = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+ vget_high_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+ vget_high_u8(s0B), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+ vget_high_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+ vget_high_u8(s1B), three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ uint8x16x2_t output_pixels0 = { {
+ vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+ } };
+ uint8x16x2_t output_pixels1 = { {
+ vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+ } };
+
+ /* Store pixel component values to memory.
+ * The minimum size of the output buffer for each row is 64 bytes => no
+ * need to worry about buffer overflow here. See "Creation of 2-D sample
+ * arrays" in jmemmgr.c for more details.
+ */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end
+ * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+ * bounds of the sample buffers without having to resort to a slow scalar
+ * tail case for the last (downsampled_width % 16) samples. See "Creation
+ * of 2-D sample arrays" in jmemmgr.c for more details.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0A = vld1q_u8(inptr0 + colctr - 1);
+ s0B = vld1q_u8(inptr1 + colctr - 1);
+ s0C = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+ three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+ three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1A = vld1q_u8(inptr0 + colctr);
+ s1B = vld1q_u8(inptr1 + colctr);
+ s1C = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+ three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+ three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel component values to memory (same 2 * colctr - 1 offset
+ * for both output rows). */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Last pixel component value in this row of the original image */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++; /* one input row is consumed per pair of output rows */
+ }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  /* Weight of the containing sample, and the bias that the upper output row
+   * receives before its truncating shift.
+   */
+  const uint8x8_t weight3 = vdup_n_u8(3);
+  const uint16x8_t bias1 = vdupq_n_u16(1);
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  JSAMPROW above, cur, below, dst_upper, dst_lower;
+  int src_row, dst_row;
+  unsigned x;
+
+  /* Each input row expands into two output rows: the upper one blends in the
+   * input row above, the lower one blends in the input row below.
+   */
+  for (src_row = 0, dst_row = 0; dst_row < max_v_samp_factor;
+       src_row++, dst_row += 2) {
+    above = input_data[src_row - 1];
+    cur = input_data[src_row];
+    below = input_data[src_row + 1];
+    dst_upper = dst_rows[dst_row];
+    dst_lower = dst_rows[dst_row + 1];
+
+    /* Whole 16-byte vectors are read and written even when
+     * downsampled_width is not a multiple of 16.  The sample buffers are
+     * sized to allow this; see "Creation of 2-D sample arrays" in jmemmgr.c.
+     */
+    for (x = 0; x < downsampled_width; x += 16) {
+      uint8x16_t sA = vld1q_u8(above + x);
+      uint8x16_t sB = vld1q_u8(cur + x);
+      uint8x16_t sC = vld1q_u8(below + x);
+      uint8x8_t sB_lo = vget_low_u8(sB);
+      uint8x8_t sB_hi = vget_high_u8(sB);
+
+      /* Widen to 16 bits: upper = sA + 3 * sB, lower = sC + 3 * sB. */
+      uint16x8_t upper_lo =
+        vmlal_u8(vmovl_u8(vget_low_u8(sA)), sB_lo, weight3);
+      uint16x8_t upper_hi =
+        vmlal_u8(vmovl_u8(vget_high_u8(sA)), sB_hi, weight3);
+      uint16x8_t lower_lo =
+        vmlal_u8(vmovl_u8(vget_low_u8(sC)), sB_lo, weight3);
+      uint16x8_t lower_hi =
+        vmlal_u8(vmovl_u8(vget_high_u8(sC)), sB_hi, weight3);
+
+      /* Upper row: add the bias of 1, then a truncating shift right by 2. */
+      upper_lo = vaddq_u16(upper_lo, bias1);
+      upper_hi = vaddq_u16(upper_hi, bias1);
+      vst1q_u8(dst_upper + x, vcombine_u8(vshrn_n_u16(upper_lo, 2),
+                                          vshrn_n_u16(upper_hi, 2)));
+      /* Lower row: rounding shift right by 2. */
+      vst1q_u8(dst_lower + x, vcombine_u8(vrshrn_n_u16(lower_lo, 2),
+                                          vrshrn_n_u16(lower_hi, 2)));
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | | |
+ * | p0 p1 | p2 p3 |
+ * | | |
+ * +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above. To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ * p0(upsampled) = p1(upsampled) = s0
+ * p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  JSAMPROW src, dst;
+  uint8x16x2_t doubled;
+  unsigned x;
+  int row = 0;
+
+  /* Every input row produces one output row that is twice as wide. */
+  while (row < max_v_samp_factor) {
+    src = input_data[row];
+    dst = dst_rows[row];
+    row++;
+    /* Consume 16 samples and emit 32 pixel component values per pass.  Whole
+     * vectors are transferred even when output_width is not a multiple of
+     * 32; see "Creation of 2-D sample arrays" in jmemmgr.c for details.
+     */
+    for (x = 0; 2 * x < output_width; x += 16) {
+      /* Both vectors of the pair hold the same 16 samples, so the
+       * interleaving store below writes every sample twice in a row. */
+      doubled.val[0] = vld1q_u8(src + x);
+      doubled.val[1] = doubled.val[0];
+      vst2q_u8(dst + 2 * x, doubled);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * sA | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * sB | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above. To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  JSAMPROW src, dst_upper, dst_lower;
+  uint8x16x2_t doubled;
+  unsigned x;
+  int src_row = 0, dst_row = 0;
+
+  /* Each input row fills two consecutive output rows of twice its width. */
+  while (dst_row < max_v_samp_factor) {
+    src = input_data[src_row++];
+    dst_upper = dst_rows[dst_row];
+    dst_lower = dst_rows[dst_row + 1];
+    dst_row += 2;
+
+    /* Consume 16 samples and emit 32 pixel component values per pass.  Whole
+     * vectors are transferred even when output_width is not a multiple of
+     * 32; see "Creation of 2-D sample arrays" in jmemmgr.c for details.
+     */
+    for (x = 0; 2 * x < output_width; x += 16) {
+      /* Both vectors of the pair hold the same 16 samples, so each
+       * interleaving store below writes every sample twice in a row. */
+      doubled.val[0] = vld1q_u8(src + x);
+      doubled.val[1] = doubled.val[0];
+      vst2q_u8(dst_upper + 2 * x, doubled);
+      vst2q_u8(dst_lower + 2 * x, doubled);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ * 0.541196100 = 17792 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382 12544 /* 0.382683433 scaled by 2^15 (see table above) */
+#define F_0_541 17792 /* 0.541196100 scaled by 2^15 */
+#define F_0_707 23168 /* 0.707106781 scaled by 2^15 */
+#define F_0_306 9984 /* 0.306562965 scaled by 2^15 */
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306 /* picked by lane index 0-3 in the FDCT */
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t data1 = vld4q_s16(data); /* first half of the block */
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE); /* second half */
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ /* Load DCT conversion constants (lanes 0-3: 0.382, 0.541, 0.707, 0.306). */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5); /* tmp12 is added on top of the 0.306 product */
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns (same arithmetic as pass 1, applied to rowN). */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0); /* results overwrite the input block */
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS 13 /* fractional bits of the F_* constants (scaled by 2^13) */
+#define PASS1_BITS 2 /* extra bits of scaling carried from pass 1 into pass 2 */
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) /* rounding shift used in pass 1 */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS) /* rounding shift used in pass 2 */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+ F_0_298, -F_0_390, F_0_541, F_0_765, /* consts.val[0], lanes 0-3 */
+ -F_0_899, F_1_175, F_1_501, -F_1_847, /* consts.val[1], lanes 0-3 */
+ -F_1_961, F_2_053, -F_2_562, F_3_072 /* consts.val[2], lanes 0-3 */
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{ /* In-place forward DCT: the block at data is loaded, transformed, stored. */
+ /* Load DCT constants (three vectors of four 16-bit lanes each). */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t s_rows_0123 = vld4q_s16(data);
+ int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows; outputs stay scaled up by 2^PASS1_BITS. */
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ int32x4_t z1_l =
+ vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ int32x4_t z1_h =
+ vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t col2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t col2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col2_scaled_h, DESCALE_P1)); /* rounding narrow to 16-bit */
+
+ int32x4_t col6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t col6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+ /* Odd part */
+ int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+ int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+ int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+ /* z5 = (z3 + z4) * F_1_175; sqrt(2) * c3 */
+ int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* tmp4 * F_0_298; sqrt(2) * (-c1+c3+c5-c7) */
+ int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* tmp5 * F_2_053; sqrt(2) * ( c1+c3-c5+c7) */
+ int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* tmp6 * F_3_072; sqrt(2) * ( c1+c3+c5-c7) */
+ int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* tmp7 * F_1_501; sqrt(2) * ( c1+c3-c5-c7) */
+ int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* z1 * -F_0_899; sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* z2 * -F_2_562; sqrt(2) * (-c1-c3) */
+ int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* z3 * -F_1_961; sqrt(2) * (-c3-c5) */
+ int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* z4 * -F_0_390; sqrt(2) * (c5-c3) */
+ int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+ vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+ vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+ vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+ vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns; the 2^PASS1_BITS scaling is removed here. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3);
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t row2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t row2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+ int32x4_t row6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t row6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+ /* Odd part */
+ z1 = vaddq_s16(tmp4, tmp7);
+ z2 = vaddq_s16(tmp5, tmp6);
+ z3 = vaddq_s16(tmp4, tmp6);
+ z4 = vaddq_s16(tmp5, tmp7);
+ /* z5 = (z3 + z4) * F_1_175; sqrt(2) * c3 */
+ z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* tmp4 * F_0_298; sqrt(2) * (-c1+c3+c5-c7) */
+ tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* tmp5 * F_2_053; sqrt(2) * ( c1+c3-c5+c7) */
+ tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* tmp6 * F_3_072; sqrt(2) * ( c1+c3+c5-c7) */
+ tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* tmp7 * F_1_501; sqrt(2) * ( c1+c3-c5-c7) */
+ tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* z1 * -F_0_899; sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* z2 * -F_2_562; sqrt(2) * (-c1-c3) */
+ z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* z3 * -F_1_961; sqrt(2) * (-c3-c5) */
+ z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* z4 * -F_0_390; sqrt(2) * (c5-c3) */
+ z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+ vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+ vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+ vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+ vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.082392200 = 2688 * 2^-15
+ * 0.414213562 = 13568 * 2^-15
+ * 0.847759065 = 27776 * 2^-15
+ * 0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS 2 /* part of the final (PASS1_BITS + 3)-bit descaling shift */
+
+#define F_0_082 2688 /* fraction only; the code adds 1 * x for 1.082392200 */
+#define F_0_414 13568 /* fraction only; the code adds 1 * x for 1.414213562 */
+#define F_0_847 27776 /* fraction only; the code adds 1 * x for 1.847759065 */
+#define F_0_613 20096 /* fraction only; the code adds 2 * x for 2.613125930 */
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+ F_0_082, F_0_414, F_0_847, F_0_613 /* lanes 0-3 of the consts vector */
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{ /* Dequantizes coef_block, runs both IDCT passes, writes 8 sample rows. */
+ IFAST_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* OR all AC rows together: a zero lane means that column has no AC terms. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row4);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into vectors.
+ */
+ int16x8_t dcval = row0;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ row4 = dcval;
+ row5 = dcval;
+ row6 = dcval;
+ row7 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_low_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_high_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+ row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+ row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+ row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+ row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+ row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+ row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+ row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_high_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_low_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+ row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+ row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+ row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+ row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+ row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+ row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+ row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+ } else {
+ /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+ /* Load quantization table. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x8_t tmp0 = row0;
+ int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+ int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+ int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
+ int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+ int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsubq_s16(tmp10, tmp13);
+ tmp1 = vaddq_s16(tmp11, tmp12);
+ tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+ int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+ int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+ int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+ int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
+ int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+ int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+ tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsubq_s16(tmp11, tmp6);
+ tmp4 = vaddq_s16(tmp10, tmp5);
+
+ row0 = vaddq_s16(tmp0, tmp7);
+ row7 = vsubq_s16(tmp0, tmp7);
+ row1 = vaddq_s16(tmp1, tmp6);
+ row6 = vsubq_s16(tmp1, tmp6);
+ row2 = vaddq_s16(tmp2, tmp5);
+ row5 = vsubq_s16(tmp2, tmp5);
+ row4 = vaddq_s16(tmp3, tmp4);
+ row3 = vsubq_s16(tmp3, tmp4);
+ }
+
+ /* Transpose rows to work on columns in pass 2. */
+ int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+ int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+ int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+ int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+ vreinterpretq_s32_s16(rows_45.val[0]));
+ int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+ vreinterpretq_s32_s16(rows_45.val[1]));
+ int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+ vreinterpretq_s32_s16(rows_67.val[0]));
+ int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+ vreinterpretq_s32_s16(rows_67.val[1]));
+
+ int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+ int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+ int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+ int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+ int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+ int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+ int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+ int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+ int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+ int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+ int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+ int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+ /* 1-D IDCT, pass 2 */
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(col0, col4);
+ int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+ int16x8_t tmp13 = vaddq_s16(col2, col6);
+ int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+ tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+ int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+ int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+ int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part */
+ int16x8_t z13 = vaddq_s16(col5, col3);
+ int16x8_t neg_z10 = vsubq_s16(col3, col5);
+ int16x8_t z11 = vaddq_s16(col1, col7);
+ int16x8_t z12 = vsubq_s16(col1, col7);
+
+ int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+ int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+ col0 = vaddq_s16(tmp0, tmp7);
+ col7 = vsubq_s16(tmp0, tmp7);
+ col1 = vaddq_s16(tmp1, tmp6);
+ col6 = vsubq_s16(tmp1, tmp6);
+ col2 = vaddq_s16(tmp2, tmp5);
+ col5 = vsubq_s16(tmp2, tmp5);
+ col4 = vaddq_s16(tmp3, tmp4);
+ col3 = vsubq_s16(tmp3, tmp4);
+
+ /* Descale by PASS1_BITS + 3 bits with a saturating narrow to signed 8-bit. */
+ int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+ vqshrn_n_s16(col1, PASS1_BITS + 3));
+ int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+ vqshrn_n_s16(col5, PASS1_BITS + 3));
+ int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+ vqshrn_n_s16(col3, PASS1_BITS + 3));
+ int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+ vqshrn_n_s16(col7, PASS1_BITS + 3));
+ /* Add CENTERJSAMPLE (wrapping add) and reinterpret as unsigned 8-bit. */
+ uint8x16_t cols_01 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+ /* Transpose block to prepare for store. */
+ uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+ vreinterpretq_u32_u8(cols_45));
+ uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+ vreinterpretq_u32_u8(cols_67));
+
+ uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+ vreinterpretq_u8_u32(cols_0415.val[1]));
+ uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+ vreinterpretq_u8_u32(cols_2637.val[1]));
+ uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+ vreinterpretq_u16_u8(cols_2367.val[0]));
+ uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+ vreinterpretq_u16_u8(cols_2367.val[1]));
+
+ uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+ uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+ uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+ uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ JSAMPROW outptr4 = output_buf[4] + output_col;
+ JSAMPROW outptr5 = output_buf[5] + output_col;
+ JSAMPROW outptr6 = output_buf[6] + output_col;
+ JSAMPROW outptr7 = output_buf[7] + output_col;
+
+ /* Store the output samples: one 64-bit lane (8 bytes) per output row. */
+ vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+ vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+ vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+ vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+ vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+ vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+ vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+ vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..d25112ef7f
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,801 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13 /* fractional bits of the F_* constants (2^-13) */
+#define PASS1_BITS 2 /* extra left scaling kept in the pass-1 results */
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) /* shift ending pass 1 */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) /* shift ending pass 2 */
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * The F_x_MINUS_y / F_x_PLUS_y macros further below pre-combine pairs of
+ * these constants, so that the IDCT helpers in this file can evaluate each
+ * term with widening multiply and multiply-accumulate/subtract intrinsics
+ * only.
+ *
+ * Scaling notes: the first pass narrows its 32-bit sums with a rounding
+ * right shift by DESCALE_P1, i.e. its results stay scaled up by
+ * 2^PASS1_BITS. The second pass removes DESCALE_P2 bits in two steps: a
+ * high-half narrowing (16 bits) followed by a rounding, saturating shift by
+ * DESCALE_P2 - 16.
+ * NOTE(review): the "+ 3" in DESCALE_P2 is presumably the 1/8 normalization
+ * of the 8x8 IDCT -- confirm against the C reference implementation.
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
+
+/* Fixed-point multipliers, laid out in groups of four. The IDCT helpers in
+ * this file load the first three groups as val[0], val[1] and val[2] of an
+ * int16x4x3_t and pick individual multipliers by lane index, so the order of
+ * the entries below is significant.
+ */
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  /* val[0], lanes 0-3 */
+  F_0_899, F_0_541, F_2_562, F_0_298_MINUS_0_899,
+  /* val[1], lanes 0-3 */
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562, F_0_541_PLUS_0_765, F_1_175,
+  /* val[2], lanes 0-3 */
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847, F_3_072_MINUS_2_562,
+  F_1_175_MINUS_1_961,
+  /* Trailing zeros; the constant loads in this file's IDCT helpers only
+   * cover the twelve entries above.
+   */
+  0, 0, 0, 0
+};
+
+
+/* Prototypes for the static pass-1 and pass-2 helpers, whose definitions
+ * follow jsimd_idct_islow_neon() in this file.
+ */
+
+/* Pass 1 on one 4x8 half, using all eight coefficient rows. */
+static INLINE void jsimd_idct_islow_pass1_regular(
+  int16x4_t row0, int16x4_t row1, int16x4_t row2, int16x4_t row3,
+  int16x4_t row4, int16x4_t row5, int16x4_t row6, int16x4_t row7,
+  int16x4_t quant_row0, int16x4_t quant_row1, int16x4_t quant_row2,
+  int16x4_t quant_row3, int16x4_t quant_row4, int16x4_t quant_row5,
+  int16x4_t quant_row6, int16x4_t quant_row7,
+  int16_t *workspace_1, int16_t *workspace_2);
+
+/* Pass 1 on one 4x8 half, using coefficient rows 0-3 only. */
+static INLINE void jsimd_idct_islow_pass1_sparse(
+  int16x4_t row0, int16x4_t row1, int16x4_t row2, int16x4_t row3,
+  int16x4_t quant_row0, int16x4_t quant_row1, int16x4_t quant_row2,
+  int16x4_t quant_row3,
+  int16_t *workspace_1, int16_t *workspace_2);
+
+/* Pass 2 on one workspace buffer, reading all eight workspace rows. */
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+/* Pass 2 on one workspace buffer, reading workspace rows 0-3 only. */
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_islow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.) Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified. On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
+ * "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
+ * result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero. In this case, the inverse DCT
+ * result is all zero. For the left 4x8 half, this is handled identically
+ * to Case 2 above. For the right 4x8 half, we do no work and signal that
+ * the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.) If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block. (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  /* Dequantize coef_block with the multipliers in dct_table, run both IDCT
+   * passes and write eight output rows (output_buf[0..7], starting at column
+   * output_col). Pass 1 handles the left and the right four columns
+   * separately and picks the cheapest helper for each half, depending on
+   * which coefficient rows are entirely zero.
+   */
+  ISLOW_MULT_TYPE *qtbl = dct_table;
+
+  /* Pass-1 results. Every pass-1 step fills one 16-element half of both
+   * buffers; pass 2 then reads each buffer as eight rows of four values.
+   */
+  int16_t ws_left[8 * DCTSIZE / 2];
+  int16_t ws_right[8 * DCTSIZE / 2];
+  int16_t *ws_left_hi = ws_left + 4 * DCTSIZE / 2;
+  int16_t *ws_right_hi = ws_right + 4 * DCTSIZE / 2;
+
+  /* ---- Pass 1, left four columns ---- */
+
+  int16x4_t c0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t c1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t c2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t c3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t c4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t c5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t c6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t c7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  int16x4_t q0 = vld1_s16(qtbl + 0 * DCTSIZE);
+  int16x4_t q1 = vld1_s16(qtbl + 1 * DCTSIZE);
+  int16x4_t q2 = vld1_s16(qtbl + 2 * DCTSIZE);
+  int16x4_t q3 = vld1_s16(qtbl + 3 * DCTSIZE);
+  int16x4_t q4 = vld1_s16(qtbl + 4 * DCTSIZE);
+  int16x4_t q5 = vld1_s16(qtbl + 5 * DCTSIZE);
+  int16x4_t q6 = vld1_s16(qtbl + 6 * DCTSIZE);
+  int16x4_t q7 = vld1_s16(qtbl + 7 * DCTSIZE);
+
+  /* OR the rows together; the 64-bit view of the result is zero only if
+   * every coefficient that went into it is zero.
+   */
+  int16x4_t nz = vorr_s16(c7, c6);
+  nz = vorr_s16(nz, c5);
+  nz = vorr_s16(nz, c4);
+  int64_t nz_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(nz), 0);
+
+  if (nz_rows_4567 != 0) {
+    /* Something is set in rows 4-7: no shortcut available. */
+    jsimd_idct_islow_pass1_regular(c0, c1, c2, c3, c4, c5, c6, c7,
+                                   q0, q1, q2, q3, q4, q5, q6, q7,
+                                   ws_left, ws_right);
+  } else {
+    nz = vorr_s16(nz, c3);
+    nz = vorr_s16(nz, c2);
+    nz = vorr_s16(nz, c1);
+    int64_t nz_left_ac = vget_lane_s64(vreinterpret_s64_s16(nz), 0);
+
+    if (nz_left_ac != 0) {
+      /* Rows 4-7 are zero but rows 1-3 are not. */
+      jsimd_idct_islow_pass1_sparse(c0, c1, c2, c3, q0, q1, q2, q3,
+                                    ws_left, ws_right);
+    } else {
+      /* Only row 0 can be non-zero: the pass-1 result is the dequantized
+       * row 0 scaled by 2^PASS1_BITS. Storing four copies with VST4
+       * performs the transposition expected by pass 2.
+       */
+      int16x4_t dc = vshl_n_s16(vmul_s16(c0, q0), PASS1_BITS);
+      int16x4x4_t dc_quad = { { dc, dc, dc, dc } };
+      vst4_s16(ws_left, dc_quad);
+      vst4_s16(ws_right, dc_quad);
+    }
+  }
+
+  /* ---- Pass 1, right four columns ---- */
+
+  c0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  c1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  c2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  c3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  c4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  c5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  c6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  c7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  q0 = vld1_s16(qtbl + 0 * DCTSIZE + 4);
+  q1 = vld1_s16(qtbl + 1 * DCTSIZE + 4);
+  q2 = vld1_s16(qtbl + 2 * DCTSIZE + 4);
+  q3 = vld1_s16(qtbl + 3 * DCTSIZE + 4);
+  q4 = vld1_s16(qtbl + 4 * DCTSIZE + 4);
+  q5 = vld1_s16(qtbl + 5 * DCTSIZE + 4);
+  q6 = vld1_s16(qtbl + 6 * DCTSIZE + 4);
+  q7 = vld1_s16(qtbl + 7 * DCTSIZE + 4);
+
+  nz = vorr_s16(c7, c6);
+  nz = vorr_s16(nz, c5);
+  nz = vorr_s16(nz, c4);
+  nz_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(nz), 0);
+  nz = vorr_s16(nz, c3);
+  nz = vorr_s16(nz, c2);
+  nz = vorr_s16(nz, c1);
+  int64_t nz_right_ac = vget_lane_s64(vreinterpret_s64_s16(nz), 0);
+
+  /* Stays non-zero unless the whole right half turns out to be zero; a
+   * non-zero value selects the "regular" second pass below.
+   */
+  int64_t nz_right_all = 1;
+
+  if (nz_right_ac != 0) {
+    if (nz_rows_4567 != 0) {
+      jsimd_idct_islow_pass1_regular(c0, c1, c2, c3, c4, c5, c6, c7,
+                                     q0, q1, q2, q3, q4, q5, q6, q7,
+                                     ws_left_hi, ws_right_hi);
+    } else {
+      jsimd_idct_islow_pass1_sparse(c0, c1, c2, c3, q0, q1, q2, q3,
+                                    ws_left_hi, ws_right_hi);
+    }
+  } else {
+    nz = vorr_s16(nz, c0);
+    nz_right_all = vget_lane_s64(vreinterpret_s64_s16(nz), 0);
+
+    if (nz_right_all != 0) {
+      /* Only row 0 is non-zero: same DC-only shortcut as for the left half.
+       * (If row 0 is zero as well, nothing is stored here; the "sparse"
+       * second pass does not read the upper workspace halves.)
+       */
+      int16x4_t dc = vshl_n_s16(vmul_s16(c0, q0), PASS1_BITS);
+      int16x4x4_t dc_quad = { { dc, dc, dc, dc } };
+      vst4_s16(ws_left_hi, dc_quad);
+      vst4_s16(ws_right_hi, dc_quad);
+    }
+  }
+
+  /* ---- Pass 2: ws_left yields output rows 0-3, ws_right rows 4-7 ---- */
+
+  if (nz_right_all != 0) {
+    jsimd_idct_islow_pass2_regular(ws_left, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(ws_right, output_buf, output_col, 4);
+  } else {
+    /* The right half of the input block was entirely zero. */
+    jsimd_idct_islow_pass2_sparse(ws_left, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(ws_right, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients. (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_islow()) can
+ * be found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(
+  int16x4_t row0, int16x4_t row1, int16x4_t row2, int16x4_t row3,
+  int16x4_t row4, int16x4_t row5, int16x4_t row6, int16x4_t row7,
+  int16x4_t quant_row0, int16x4_t quant_row1, int16x4_t quant_row2,
+  int16x4_t quant_row3, int16x4_t quant_row4, int16x4_t quant_row5,
+  int16x4_t quant_row6, int16x4_t quant_row7,
+  int16_t *workspace_1, int16_t *workspace_2)
+{
+  /* General first pass for four columns: dequantize rows 0-7, compute the
+   * 8-point IDCT down each column and store result rows 0-3 to workspace_1
+   * and result rows 4-7 to workspace_2 (16 values each, transposed by VST4).
+   */
+
+  /* Multiplier table; lanes are named in the comments where they are used. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t fix = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4x3_t fix = { {
+    vld1_s16(jsimd_idct_islow_neon_consts),
+    vld1_s16(jsimd_idct_islow_neon_consts + 4),
+    vld1_s16(jsimd_idct_islow_neon_consts + 8)
+  } };
+#endif
+
+  /* Dequantization (16-bit products). */
+  const int16x4_t in0 = vmul_s16(row0, quant_row0);
+  const int16x4_t in1 = vmul_s16(row1, quant_row1);
+  const int16x4_t in2 = vmul_s16(row2, quant_row2);
+  const int16x4_t in3 = vmul_s16(row3, quant_row3);
+  const int16x4_t in4 = vmul_s16(row4, quant_row4);
+  const int16x4_t in5 = vmul_s16(row5, quant_row5);
+  const int16x4_t in6 = vmul_s16(row6, quant_row6);
+  const int16x4_t in7 = vmul_s16(row7, quant_row7);
+
+  /* Even part: combine in2/in6 with the 0.541/0.765/1.847 multipliers, then
+   * butterfly with (in0 +/- in4) << CONST_BITS.
+   */
+  int32x4_t rot_a = vmull_lane_s16(in2, fix.val[0], 1);
+  /* rot_a = in2 * F_0_541 + in6 * F_0_541_MINUS_1_847 */
+  rot_a = vmlal_lane_s16(rot_a, in6, fix.val[2], 1);
+  int32x4_t rot_b = vmull_lane_s16(in2, fix.val[1], 2);
+  /* rot_b = in2 * F_0_541_PLUS_0_765 + in6 * F_0_541 */
+  rot_b = vmlal_lane_s16(rot_b, in6, fix.val[0], 1);
+
+  const int32x4_t sum04 = vshll_n_s16(vadd_s16(in0, in4), CONST_BITS);
+  const int32x4_t dif04 = vshll_n_s16(vsub_s16(in0, in4), CONST_BITS);
+
+  const int32x4_t even0 = vaddq_s32(sum04, rot_b);
+  const int32x4_t even1 = vaddq_s32(dif04, rot_a);
+  const int32x4_t even2 = vsubq_s32(dif04, rot_a);
+  const int32x4_t even3 = vsubq_s32(sum04, rot_b);
+
+  /* Odd part. Two terms are shared between the four odd outputs:
+   *   shared_a = (in7 + in3) * F_1_175_MINUS_1_961 + (in5 + in1) * F_1_175
+   *   shared_b = (in7 + in3) * F_1_175 + (in5 + in1) * F_1_175_MINUS_0_390
+   */
+  const int16x4_t sum73 = vadd_s16(in7, in3);
+  const int16x4_t sum51 = vadd_s16(in5, in1);
+
+  int32x4_t shared_a = vmull_lane_s16(sum73, fix.val[2], 3);
+  shared_a = vmlal_lane_s16(shared_a, sum51, fix.val[1], 3);
+  int32x4_t shared_b = vmull_lane_s16(sum73, fix.val[1], 3);
+  shared_b = vmlal_lane_s16(shared_b, sum51, fix.val[2], 0);
+
+  /* odd0 = in7 * F_0_298_MINUS_0_899 - in1 * F_0_899 + shared_a */
+  int32x4_t odd0 = vmull_lane_s16(in7, fix.val[0], 3);
+  odd0 = vmlsl_lane_s16(odd0, in1, fix.val[0], 0);
+  odd0 = vaddq_s32(odd0, shared_a);
+
+  /* odd1 = in5 * F_2_053_MINUS_2_562 - in3 * F_2_562 + shared_b */
+  int32x4_t odd1 = vmull_lane_s16(in5, fix.val[1], 1);
+  odd1 = vmlsl_lane_s16(odd1, in3, fix.val[0], 2);
+  odd1 = vaddq_s32(odd1, shared_b);
+
+  /* odd2 = in3 * F_3_072_MINUS_2_562 - in5 * F_2_562 + shared_a */
+  int32x4_t odd2 = vmull_lane_s16(in3, fix.val[2], 2);
+  odd2 = vmlsl_lane_s16(odd2, in5, fix.val[0], 2);
+  odd2 = vaddq_s32(odd2, shared_a);
+
+  /* odd3 = in1 * F_1_501_MINUS_0_899 - in7 * F_0_899 + shared_b */
+  int32x4_t odd3 = vmull_lane_s16(in1, fix.val[1], 0);
+  odd3 = vmlsl_lane_s16(odd3, in7, fix.val[0], 0);
+  odd3 = vaddq_s32(odd3, shared_b);
+
+  /* Output stage: rounding shift by DESCALE_P1 while narrowing to 16 bits. */
+  int16x4x4_t top, bottom;
+  top.val[0] = vrshrn_n_s32(vaddq_s32(even0, odd3), DESCALE_P1);
+  top.val[1] = vrshrn_n_s32(vaddq_s32(even1, odd2), DESCALE_P1);
+  top.val[2] = vrshrn_n_s32(vaddq_s32(even2, odd1), DESCALE_P1);
+  top.val[3] = vrshrn_n_s32(vaddq_s32(even3, odd0), DESCALE_P1);
+  bottom.val[0] = vrshrn_n_s32(vsubq_s32(even3, odd0), DESCALE_P1);
+  bottom.val[1] = vrshrn_n_s32(vsubq_s32(even2, odd1), DESCALE_P1);
+  bottom.val[2] = vrshrn_n_s32(vsubq_s32(even1, odd2), DESCALE_P1);
+  bottom.val[3] = vrshrn_n_s32(vsubq_s32(even0, odd3), DESCALE_P1);
+
+  /* VST4 interleaves the four vectors, i.e. it writes each 4x4 block
+   * transposed, which is the layout the second pass reads.
+   */
+  vst4_s16(workspace_1, top);
+  vst4_s16(workspace_2, bottom);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0. This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(
+  int16x4_t row0, int16x4_t row1, int16x4_t row2, int16x4_t row3,
+  int16x4_t quant_row0, int16x4_t quant_row1, int16x4_t quant_row2,
+  int16x4_t quant_row3,
+  int16_t *workspace_1, int16_t *workspace_2)
+{
+  /* Reduced first pass for four columns in which rows 4-7 are zero: only
+   * rows 0-3 are dequantized and every term that would involve rows 4-7 is
+   * left out. Result rows 0-3 go to workspace_1, rows 4-7 to workspace_2
+   * (16 values each, transposed by VST4).
+   */
+
+  /* Multiplier table; lanes are named in the comments where they are used. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t fix = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4x3_t fix = { {
+    vld1_s16(jsimd_idct_islow_neon_consts),
+    vld1_s16(jsimd_idct_islow_neon_consts + 4),
+    vld1_s16(jsimd_idct_islow_neon_consts + 8)
+  } };
+#endif
+
+  /* Dequantization (16-bit products). */
+  const int16x4_t in0 = vmul_s16(row0, quant_row0);
+  const int16x4_t in1 = vmul_s16(row1, quant_row1);
+  const int16x4_t in2 = vmul_s16(row2, quant_row2);
+  const int16x4_t in3 = vmul_s16(row3, quant_row3);
+
+  /* Even part: with rows 4 and 6 zero, only in0 and in2 contribute. */
+  const int32x4_t rot_a = vmull_lane_s16(in2, fix.val[0], 1);
+  /* rot_a = in2 * F_0_541, rot_b = in2 * F_0_541_PLUS_0_765 */
+  const int32x4_t rot_b = vmull_lane_s16(in2, fix.val[1], 2);
+  const int32x4_t dc = vshll_n_s16(in0, CONST_BITS);
+
+  const int32x4_t even0 = vaddq_s32(dc, rot_b);
+  const int32x4_t even1 = vaddq_s32(dc, rot_a);
+  const int32x4_t even2 = vsubq_s32(dc, rot_a);
+  const int32x4_t even3 = vsubq_s32(dc, rot_b);
+
+  /* Odd part: with rows 5 and 7 zero, only in1 and in3 contribute.
+   *   shared_a = in3 * F_1_175_MINUS_1_961 + in1 * F_1_175
+   *   shared_b = in3 * F_1_175 + in1 * F_1_175_MINUS_0_390
+   */
+  int32x4_t shared_a = vmull_lane_s16(in3, fix.val[2], 3);
+  shared_a = vmlal_lane_s16(shared_a, in1, fix.val[1], 3);
+  int32x4_t shared_b = vmull_lane_s16(in3, fix.val[1], 3);
+  shared_b = vmlal_lane_s16(shared_b, in1, fix.val[2], 0);
+
+  /* odd0 = shared_a - in1 * F_0_899 */
+  const int32x4_t odd0 = vmlsl_lane_s16(shared_a, in1, fix.val[0], 0);
+  /* odd1 = shared_b - in3 * F_2_562 */
+  const int32x4_t odd1 = vmlsl_lane_s16(shared_b, in3, fix.val[0], 2);
+  /* odd2 = shared_a + in3 * F_3_072_MINUS_2_562 */
+  const int32x4_t odd2 = vmlal_lane_s16(shared_a, in3, fix.val[2], 2);
+  /* odd3 = shared_b + in1 * F_1_501_MINUS_0_899 */
+  const int32x4_t odd3 = vmlal_lane_s16(shared_b, in1, fix.val[1], 0);
+
+  /* Output stage: rounding shift by DESCALE_P1 while narrowing to 16 bits. */
+  int16x4x4_t top, bottom;
+  top.val[0] = vrshrn_n_s32(vaddq_s32(even0, odd3), DESCALE_P1);
+  top.val[1] = vrshrn_n_s32(vaddq_s32(even1, odd2), DESCALE_P1);
+  top.val[2] = vrshrn_n_s32(vaddq_s32(even2, odd1), DESCALE_P1);
+  top.val[3] = vrshrn_n_s32(vaddq_s32(even3, odd0), DESCALE_P1);
+  bottom.val[0] = vrshrn_n_s32(vsubq_s32(even3, odd0), DESCALE_P1);
+  bottom.val[1] = vrshrn_n_s32(vsubq_s32(even2, odd1), DESCALE_P1);
+  bottom.val[2] = vrshrn_n_s32(vsubq_s32(even1, odd2), DESCALE_P1);
+  bottom.val[3] = vrshrn_n_s32(vsubq_s32(even0, odd3), DESCALE_P1);
+
+  /* VST4 interleaves the four vectors, i.e. it writes each 4x4 block
+   * transposed, which is the layout the second pass reads.
+   */
+  vst4_s16(workspace_1, top);
+  vst4_s16(workspace_2, bottom);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients. (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT
+ * (jpeg_idct_islow()) can be found in jidctint.c. Algorithmic changes made
+ * here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* General second pass: read eight rows of four 16-bit values from
+   * workspace, compute the 8-point IDCT, convert to 8-bit samples and write
+   * eight samples to each of the four output rows
+   * output_buf[buf_offset .. buf_offset + 3], starting at output_col.
+   */
+
+  /* Multiplier table; lanes are named in the comments where they are used. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t fix = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4x3_t fix = { {
+    vld1_s16(jsimd_idct_islow_neon_consts),
+    vld1_s16(jsimd_idct_islow_neon_consts + 4),
+    vld1_s16(jsimd_idct_islow_neon_consts + 8)
+  } };
+#endif
+
+  /* Workspace rows (four values each). */
+  const int16x4_t in0 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  const int16x4_t in1 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+  const int16x4_t in2 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  const int16x4_t in3 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  const int16x4_t in4 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+  const int16x4_t in5 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  const int16x4_t in6 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+  const int16x4_t in7 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+
+  /* Even part: combine in2/in6 with the 0.541/0.765/1.847 multipliers, then
+   * butterfly with (in0 +/- in4) << CONST_BITS.
+   */
+  int32x4_t rot_a = vmull_lane_s16(in2, fix.val[0], 1);
+  /* rot_a = in2 * F_0_541 + in6 * F_0_541_MINUS_1_847 */
+  rot_a = vmlal_lane_s16(rot_a, in6, fix.val[2], 1);
+  int32x4_t rot_b = vmull_lane_s16(in2, fix.val[1], 2);
+  /* rot_b = in2 * F_0_541_PLUS_0_765 + in6 * F_0_541 */
+  rot_b = vmlal_lane_s16(rot_b, in6, fix.val[0], 1);
+
+  const int32x4_t sum04 = vshll_n_s16(vadd_s16(in0, in4), CONST_BITS);
+  const int32x4_t dif04 = vshll_n_s16(vsub_s16(in0, in4), CONST_BITS);
+
+  const int32x4_t even0 = vaddq_s32(sum04, rot_b);
+  const int32x4_t even1 = vaddq_s32(dif04, rot_a);
+  const int32x4_t even2 = vsubq_s32(dif04, rot_a);
+  const int32x4_t even3 = vsubq_s32(sum04, rot_b);
+
+  /* Odd part. Two terms are shared between the four odd outputs:
+   *   shared_a = (in7 + in3) * F_1_175_MINUS_1_961 + (in5 + in1) * F_1_175
+   *   shared_b = (in7 + in3) * F_1_175 + (in5 + in1) * F_1_175_MINUS_0_390
+   */
+  const int16x4_t sum73 = vadd_s16(in7, in3);
+  const int16x4_t sum51 = vadd_s16(in5, in1);
+
+  int32x4_t shared_a = vmull_lane_s16(sum73, fix.val[2], 3);
+  shared_a = vmlal_lane_s16(shared_a, sum51, fix.val[1], 3);
+  int32x4_t shared_b = vmull_lane_s16(sum73, fix.val[1], 3);
+  shared_b = vmlal_lane_s16(shared_b, sum51, fix.val[2], 0);
+
+  /* odd0 = in7 * F_0_298_MINUS_0_899 - in1 * F_0_899 + shared_a */
+  int32x4_t odd0 = vmull_lane_s16(in7, fix.val[0], 3);
+  odd0 = vmlsl_lane_s16(odd0, in1, fix.val[0], 0);
+  odd0 = vaddq_s32(odd0, shared_a);
+
+  /* odd1 = in5 * F_2_053_MINUS_2_562 - in3 * F_2_562 + shared_b */
+  int32x4_t odd1 = vmull_lane_s16(in5, fix.val[1], 1);
+  odd1 = vmlsl_lane_s16(odd1, in3, fix.val[0], 2);
+  odd1 = vaddq_s32(odd1, shared_b);
+
+  /* odd2 = in3 * F_3_072_MINUS_2_562 - in5 * F_2_562 + shared_a */
+  int32x4_t odd2 = vmull_lane_s16(in3, fix.val[2], 2);
+  odd2 = vmlsl_lane_s16(odd2, in5, fix.val[0], 2);
+  odd2 = vaddq_s32(odd2, shared_a);
+
+  /* odd3 = in1 * F_1_501_MINUS_0_899 - in7 * F_0_899 + shared_b */
+  int32x4_t odd3 = vmull_lane_s16(in1, fix.val[1], 0);
+  odd3 = vmlsl_lane_s16(odd3, in7, fix.val[0], 0);
+  odd3 = vaddq_s32(odd3, shared_b);
+
+  /* Output stage, step 1: add/subtract and keep the high 16 bits of each
+   * 32-bit result. Results are paired as (0,2), (1,3), (4,6) and (5,7).
+   */
+  const int16x8_t pair02 = vcombine_s16(vaddhn_s32(even0, odd3),
+                                        vaddhn_s32(even2, odd1));
+  const int16x8_t pair13 = vcombine_s16(vaddhn_s32(even1, odd2),
+                                        vaddhn_s32(even3, odd0));
+  const int16x8_t pair46 = vcombine_s16(vsubhn_s32(even3, odd0),
+                                        vsubhn_s32(even1, odd2));
+  const int16x8_t pair57 = vcombine_s16(vsubhn_s32(even2, odd1),
+                                        vsubhn_s32(even0, odd3));
+
+  /* Step 2: remove the remaining DESCALE_P2 - 16 bits with rounding and
+   * saturate to the signed 8-bit range.
+   */
+  const int8x8_t s02 = vqrshrn_n_s16(pair02, DESCALE_P2 - 16);
+  const int8x8_t s13 = vqrshrn_n_s16(pair13, DESCALE_P2 - 16);
+  const int8x8_t s46 = vqrshrn_n_s16(pair46, DESCALE_P2 - 16);
+  const int8x8_t s57 = vqrshrn_n_s16(pair57, DESCALE_P2 - 16);
+
+  /* Step 3: re-center by adding CENTERJSAMPLE with unsigned wrap-around.
+   * (Assuming CENTERJSAMPLE is 128, this maps [-128, 127] onto [0, 255].)
+   */
+  const uint8x8_t center = vdup_n_u8(CENTERJSAMPLE);
+  const uint8x8_t px02 = vadd_u8(vreinterpret_u8_s8(s02), center);
+  const uint8x8_t px13 = vadd_u8(vreinterpret_u8_s8(s13), center);
+  const uint8x8_t px46 = vadd_u8(vreinterpret_u8_s8(s46), center);
+  const uint8x8_t px57 = vadd_u8(vreinterpret_u8_s8(s57), center);
+
+  /* Zip neighbouring results into byte pairs so that the final transpose can
+   * be done on 16-bit elements.
+   */
+  const uint8x8x2_t zip_lo = vzip_u8(px02, px13);
+  const uint8x8x2_t zip_hi = vzip_u8(px46, px57);
+  uint16x4x4_t packed;
+  packed.val[0] = vreinterpret_u16_u8(zip_lo.val[0]);
+  packed.val[1] = vreinterpret_u16_u8(zip_lo.val[1]);
+  packed.val[2] = vreinterpret_u16_u8(zip_hi.val[0]);
+  packed.val[3] = vreinterpret_u16_u8(zip_hi.val[1]);
+
+  JSAMPROW dst0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW dst1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW dst2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW dst3 = output_buf[buf_offset + 3] + output_col;
+
+  /* Each lane store writes four 16-bit elements (eight samples) to one row,
+   * which completes the transpose.
+   */
+  vst4_lane_u16((uint16_t *)dst0, packed, 0);
+  vst4_lane_u16((uint16_t *)dst1, packed, 1);
+  vst4_lane_u16((uint16_t *)dst2, packed, 2);
+  vst4_lane_u16((uint16_t *)dst3, packed, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Reduced second pass: only workspace rows 0-3 are read, and every term
+   * that would involve rows 4-7 is left out. The result is converted to
+   * 8-bit samples and written as eight samples to each of the four output
+   * rows output_buf[buf_offset .. buf_offset + 3], starting at output_col.
+   */
+
+  /* Multiplier table; lanes are named in the comments where they are used. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t fix = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4x3_t fix = { {
+    vld1_s16(jsimd_idct_islow_neon_consts),
+    vld1_s16(jsimd_idct_islow_neon_consts + 4),
+    vld1_s16(jsimd_idct_islow_neon_consts + 8)
+  } };
+#endif
+
+  /* Workspace rows 0-3 (four values each). */
+  const int16x4_t in0 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  const int16x4_t in1 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+  const int16x4_t in2 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  const int16x4_t in3 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+
+  /* Even part: only in0 and in2 contribute. */
+  const int32x4_t rot_a = vmull_lane_s16(in2, fix.val[0], 1);
+  /* rot_a = in2 * F_0_541, rot_b = in2 * F_0_541_PLUS_0_765 */
+  const int32x4_t rot_b = vmull_lane_s16(in2, fix.val[1], 2);
+  const int32x4_t dc = vshll_n_s16(in0, CONST_BITS);
+
+  const int32x4_t even0 = vaddq_s32(dc, rot_b);
+  const int32x4_t even1 = vaddq_s32(dc, rot_a);
+  const int32x4_t even2 = vsubq_s32(dc, rot_a);
+  const int32x4_t even3 = vsubq_s32(dc, rot_b);
+
+  /* Odd part: only in1 and in3 contribute.
+   *   shared_a = in3 * F_1_175_MINUS_1_961 + in1 * F_1_175
+   *   shared_b = in3 * F_1_175 + in1 * F_1_175_MINUS_0_390
+   */
+  int32x4_t shared_a = vmull_lane_s16(in3, fix.val[2], 3);
+  shared_a = vmlal_lane_s16(shared_a, in1, fix.val[1], 3);
+  int32x4_t shared_b = vmull_lane_s16(in3, fix.val[1], 3);
+  shared_b = vmlal_lane_s16(shared_b, in1, fix.val[2], 0);
+
+  /* odd0 = shared_a - in1 * F_0_899 */
+  const int32x4_t odd0 = vmlsl_lane_s16(shared_a, in1, fix.val[0], 0);
+  /* odd1 = shared_b - in3 * F_2_562 */
+  const int32x4_t odd1 = vmlsl_lane_s16(shared_b, in3, fix.val[0], 2);
+  /* odd2 = shared_a + in3 * F_3_072_MINUS_2_562 */
+  const int32x4_t odd2 = vmlal_lane_s16(shared_a, in3, fix.val[2], 2);
+  /* odd3 = shared_b + in1 * F_1_501_MINUS_0_899 */
+  const int32x4_t odd3 = vmlal_lane_s16(shared_b, in1, fix.val[1], 0);
+
+  /* Output stage, step 1: add/subtract and keep the high 16 bits of each
+   * 32-bit result. Results are paired as (0,2), (1,3), (4,6) and (5,7).
+   */
+  const int16x8_t pair02 = vcombine_s16(vaddhn_s32(even0, odd3),
+                                        vaddhn_s32(even2, odd1));
+  const int16x8_t pair13 = vcombine_s16(vaddhn_s32(even1, odd2),
+                                        vaddhn_s32(even3, odd0));
+  const int16x8_t pair46 = vcombine_s16(vsubhn_s32(even3, odd0),
+                                        vsubhn_s32(even1, odd2));
+  const int16x8_t pair57 = vcombine_s16(vsubhn_s32(even2, odd1),
+                                        vsubhn_s32(even0, odd3));
+
+  /* Step 2: remove the remaining DESCALE_P2 - 16 bits with rounding and
+   * saturate to the signed 8-bit range.
+   */
+  const int8x8_t s02 = vqrshrn_n_s16(pair02, DESCALE_P2 - 16);
+  const int8x8_t s13 = vqrshrn_n_s16(pair13, DESCALE_P2 - 16);
+  const int8x8_t s46 = vqrshrn_n_s16(pair46, DESCALE_P2 - 16);
+  const int8x8_t s57 = vqrshrn_n_s16(pair57, DESCALE_P2 - 16);
+
+  /* Step 3: re-center by adding CENTERJSAMPLE with unsigned wrap-around.
+   * (Assuming CENTERJSAMPLE is 128, this maps [-128, 127] onto [0, 255].)
+   */
+  const uint8x8_t center = vdup_n_u8(CENTERJSAMPLE);
+  const uint8x8_t px02 = vadd_u8(vreinterpret_u8_s8(s02), center);
+  const uint8x8_t px13 = vadd_u8(vreinterpret_u8_s8(s13), center);
+  const uint8x8_t px46 = vadd_u8(vreinterpret_u8_s8(s46), center);
+  const uint8x8_t px57 = vadd_u8(vreinterpret_u8_s8(s57), center);
+
+  /* Zip neighbouring results into byte pairs so that the final transpose can
+   * be done on 16-bit elements.
+   */
+  const uint8x8x2_t zip_lo = vzip_u8(px02, px13);
+  const uint8x8x2_t zip_hi = vzip_u8(px46, px57);
+  uint16x4x4_t packed;
+  packed.val[0] = vreinterpret_u16_u8(zip_lo.val[0]);
+  packed.val[1] = vreinterpret_u16_u8(zip_lo.val[1]);
+  packed.val[2] = vreinterpret_u16_u8(zip_hi.val[0]);
+  packed.val[3] = vreinterpret_u16_u8(zip_hi.val[1]);
+
+  JSAMPROW dst0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW dst1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW dst2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW dst3 = output_buf[buf_offset + 3] + output_col;
+
+  /* Each lane store writes four 16-bit elements (eight samples) to one row,
+   * which completes the transpose.
+   */
+  vst4_lane_u16((uint16_t *)dst0, packed, 0);
+  vst4_lane_u16((uint16_t *)dst1, packed, 1);
+  vst4_lane_u16((uint16_t *)dst2, packed, 2);
+  vst4_lane_u16((uint16_t *)dst3, packed, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_211 1730
+#define F_0_509 4176
+#define F_0_601 4926
+#define F_0_720 5906
+#define F_0_765 6270
+#define F_0_850 6967
+#define F_0_899 7373
+#define F_1_061 8697
+#define F_1_272 10426
+#define F_1_451 11893
+#define F_1_847 15137
+#define F_2_172 17799
+#define F_2_562 20995
+#define F_3_624 29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+ -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients (only rows 0, 1, 3, 5, and 7 are used). */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+ /* Pass 1: process columns from input, put results in vectors row0 and
+ * row1.
+ */
+
+ /* Even part: widen row 0 to 32-bit, scaled up by CONST_BITS + 2. */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part: multiply-accumulate rows 1, 3, 5, and 7 with the constants. */
+ int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+ int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows, ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process two rows, store to output array. */
+
+ /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+ * care."
+ */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part: we're only interested in the bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+ tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block: lanes 0 and 1 are column 0, lanes 4 and 5 column 1. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.211164243 = 1730 * 2^-13
+ * 0.509795579 = 4176 * 2^-13
+ * 0.601344887 = 4926 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.061594337 = 8697 * 2^-13
+ * 1.451774981 = 11893 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 2.172734803 = 17799 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients (row 4 is not used). */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients (rows 1-3, 5-7) are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+ */
+ int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+ row0 = dcval;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ } else {
+ /* Non-zero AC coefficients in both halves; full IDCT calculation required. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+ int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x8_t z2 = vmulq_s16(row2, quant_row2);
+ int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+ int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+ int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+ int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+ int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+ /* Odd part */
+ int16x8_t z1 = vmulq_s16(row7, quant_row7);
+ z2 = vmulq_s16(row5, quant_row5);
+ z3 = vmulq_s16(row3, quant_row3);
+ int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+ tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+ tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+ tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+ tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ }
+
+ /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+ int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+ int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+ vreinterpretq_s32_s16(row_23.val[0]));
+ int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+ vreinterpretq_s32_s16(row_23.val[1]));
+
+ int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+ int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+ int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+ int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+ int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+ int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+ int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+ /* Commence second pass of IDCT. */
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+ int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+ vsubhn_s32(tmp12, tmp0));
+ int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+ vsubhn_s32(tmp10, tmp2));
+ output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+ * An interleaving store completes the transpose.
+ */
+ uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+ vqmovun_s16(output_cols_13));
+ uint16x4x2_t output_01_23 = { {
+ vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ } };
+
+ /* Store 4x4 block to memory. */
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, sample values are in the range [0, 255], but the
+ * Discrete Cosine Transform (DCT) operates on values centered around 0.
+ *
+ * To prepare sample values for the DCT, load eight samples from each of the
+ * eight rows in sample_data (starting at start_col), subtracting CENTERJSAMPLE
+ * (128). The samples, now in the range [-128, 127], are also widened from 8-
+ * to 16-bit and stored to workspace, one row per DCTSIZE elements.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+ int16x8_t row0 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR out_ptr = coef_block;
+ UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+ UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+ DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+ int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+ for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ /* Load four rows of DCT coefficients per iteration. */
+ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+ /* Load reciprocals, corrections, and shift values from divisors table. */
+ uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+ uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+ int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+ int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+ int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+ int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+ /* Extract sign from coefficients. */
+ int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+ int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+ int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+ int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+ /* Get absolute value of DCT coefficients. */
+ uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+ uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+ uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+ uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+ /* Add correction. */
+ abs_row0 = vaddq_u16(abs_row0, corr0);
+ abs_row1 = vaddq_u16(abs_row1, corr1);
+ abs_row2 = vaddq_u16(abs_row2, corr2);
+ abs_row3 = vaddq_u16(abs_row3, corr3);
+
+ /* Multiply DCT coefficients by quantization reciprocals. */
+ int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+ vget_low_u16(recip0)));
+ int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+ vget_high_u16(recip0)));
+ int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+ vget_low_u16(recip1)));
+ int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+ vget_high_u16(recip1)));
+ int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+ vget_low_u16(recip2)));
+ int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+ vget_high_u16(recip2)));
+ int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+ vget_low_u16(recip3)));
+ int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+ vget_high_u16(recip3)));
+ /* Narrow back to 16-bit, keeping the upper 16 bits of each product. */
+ row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+ row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+ row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+ row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+ /* Since VSHR only supports an immediate as its second argument, negate the
+ * shift value and shift left.
+ */
+ row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+ vnegq_s16(shift0)));
+ row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+ vnegq_s16(shift1)));
+ row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+ vnegq_s16(shift2)));
+ row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+ vnegq_s16(shift3)));
+
+ /* Restore sign: (x ^ sign) - sign negates x where sign is all ones. */
+ row0 = veorq_s16(row0, sign_row0);
+ row0 = vsubq_s16(row0, sign_row0);
+ row1 = veorq_s16(row1, sign_row1);
+ row1 = vsubq_s16(row1, sign_row1);
+ row2 = veorq_s16(row2, sign_row2);
+ row2 = vsubq_s16(row2, sign_row2);
+ row3 = veorq_s16(row3, sign_row3);
+ row3 = vsubq_s16(row3, sign_row3);
+
+ /* Store quantized coefficients to memory. */
+ vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+ vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+ vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+ vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+ }
+}
diff --git a/media/libjpeg/simd/arm/neon-compat.h b/media/libjpeg/simd/arm/neon-compat.h
new file mode 100644
index 0000000000..2907634e26
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Compiler-independent CLZ and byte-swap macros (MSVC, Clang, or GCC only) */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZLL(x) __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width (b = address of the saved ebp; b+4 is the return address)
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+; Local frame, addressed from the YMMWORD-aligned ebp that the prologue sets up:
+%define original_ebp ebp + 0 ; slot where the prologue stores the unaligned frame base
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]: wk(0) is the lowest slot, wk(WK_NUM-1) ends at the aligned ebp
+%define WK_NUM 8 ; number of YMMWORD scratch slots
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr (pointer-sized slot just below wk(0))
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+; Entry: builds an aligned frame, then sets esi = input row pointers, edi/ebx/edx = Y/Cb/Cr row pointers, ecx = img_width, eax = num_rows.
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = frame base: points at the saved ebp, arguments start at eax+8
+ sub esp, byte 4 ; reserve a dword so the aligned slot below cannot overlap the saved ebp
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax ; remember the frame base; "pop esp" at .return reloads it
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; carve out wk[WK_NUM] just below the aligned ebp
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return ; img_width == 0: nothing to convert
+
+ push ecx ; park img_width: ecx is reused for output_row below
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] ; ebx = output_buf[1]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] ; edx = output_buf[2]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &output_buf[1][output_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &output_buf[2][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; last use of eax as frame base; it becomes the row counter
+ test eax, eax
+ jle near .return ; signed test: zero or negative num_rows does nothing
+ alignx 16, 7
+.rowloop:
+ pushpic eax ; row counter is parked while eax carries the GOT address (pushpic/movpic are defined outside this file)
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop ; at least SIZEOF_YMMWORD columns: load full vectors
+ alignx 16, 7
+; Otherwise fall through into the partial-row loader (.column_ld1).
+%if RGB_PIXELSIZE == 3 ; ---------------
+; Partial-row loader (3 bytes/pixel): the remaining bytes are gathered from the row tail inward, one chunk per set bit of the byte count.
+.column_ld1:
+ push eax ; eax (GOT address) and edx (outptr2) are borrowed as scratch
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx] ; eax = last byte of the row
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT ; what eax already holds moves above the word that precedes it in memory
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax ; xmmA = tail bytes gathered so far (stale eax bits only fill lanes past the row end)
+ pop edx ; restore outptr2
+ pop eax ; restore GOT address
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx] ; NOTE(review): XMM_MMWORD tag on a 16-byte load; harmless only if the macro expands to nothing - confirm in jsimdext.inc
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap 128-bit lanes: gathered bytes move to the upper lane
+ vpor ymmA, ymmB ; lower lane = the 16 bytes just loaded
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second input vector
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD ; mov keeps ZF from the test; ecx = one pass, so the loop tail ends at 0
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA ; gathered tail becomes the third input vector
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+; Notation: first character = byte index within the pixel, second = pixel index (0..V).
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH ; ymmH = 0, used to widen bytes to words below
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+; Partial-row loader (4 bytes/pixel): ecx counts pixels; chunks are fetched from the row tail inward, one per set bit of ecx.
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1 ; move the pixels gathered so far to the upper lane
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] ; VEX xmm load: fills the lower lane and zeroes the upper one
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second input vector
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov ecx, SIZEOF_YMMWORD ; mov keeps ZF from the test; ecx = one pass, so the loop tail ends at 0
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA ; gathered tail becomes the third and fourth input vectors
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+; Notation: first character = byte index within the pixel, second = pixel index (0..V).
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF ; ymmF = 0, used to widen bytes to words below
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+; No zero register is left for the last pair: each byte is placed in the high half of a word, then shifted down.
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+; NOTE(review): ymmA..ymmH above are aliases defined outside this file; presumably they map onto ymm0-ymm7 so the layout stated here holds - confirm in jcolsamp.inc.
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+; Scratch usage: wk(0..3) keep RE/RO/BE/BO; wk(4..5) and wk(6..7) keep the R/G part of Y for odd and even samples.
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+; Odd samples: R/G word pairs feed both the Y partial sum and Cb.
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+; Unpacking zero with B puts B in the high word of each dword (B << 16); the shift right by 1 then gives B << 15.
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
+; Even samples: same R/G products, with the Y part kept in wk(6..7).
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+; Odd results move to the high byte of each word and are ORed with the even results, restoring column order (relies on each result fitting in 8 bits).
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [ebx], ymm5 ; Save Cb (a full YMMWORD even after a partial load - presumably rows are padded; confirm)
+; Odd samples: B/G products complete Y (added to wk(4..5)) and start Cr.
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+; Even samples: B/G products complete Y (added to wk(6..7)) and start Cr.
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [edx], ymm1 ; Save Cr
+ sub ecx, byte SIZEOF_YMMWORD ; one YMMWORD worth of columns has been converted
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ add ebx, byte SIZEOF_YMMWORD ; outptr1
+ add edx, byte SIZEOF_YMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop ; another full vector of columns remains
+ test ecx, ecx
+ jnz near .column_ld1 ; a short remainder is finished by the partial-row loader
+; Row done: restore the per-row registers saved at .rowloop (reverse push order).
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+; Step every row-pointer array to its next entry.
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ vzeroupper ; clear the upper YMM halves before returning to non-AVX code
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width (b = address of the saved ebp; b+4 is the return address)
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+; Local frame, addressed from the MMWORD-aligned ebp that the prologue sets up:
+%define original_ebp ebp + 0 ; slot where the prologue stores the unaligned frame base
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]: wk(0) is the lowest slot, wk(WK_NUM-1) ends at the aligned ebp
+%define WK_NUM 8 ; number of MMWORD scratch slots
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr (pointer-sized slot just below wk(0))
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+; Entry: builds an aligned frame, then sets esi = input row pointers, edi/ebx/edx = Y/Cb/Cr row pointers, ecx = img_width, eax = num_rows.
+EXTN(jsimd_rgb_ycc_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = frame base: points at the saved ebp, arguments start at eax+8
+ sub esp, byte 4 ; reserve a dword so the aligned slot below cannot overlap the saved ebp
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; remember the frame base; "pop esp" at .return reloads it
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; carve out wk[WK_NUM] just below the aligned ebp
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return ; img_width == 0: nothing to convert
+
+ push ecx ; park img_width: ecx is reused for output_row below
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] ; ebx = output_buf[1]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] ; edx = output_buf[2]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &output_buf[1][output_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &output_buf[2][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; last use of eax as frame base; it becomes the row counter
+ test eax, eax
+ jle near .return ; signed test: zero or negative num_rows does nothing
+ alignx 16, 7
+.rowloop:
+ pushpic eax ; row counter is parked while eax carries the GOT address (pushpic/movpic are defined outside this file)
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop ; at least SIZEOF_MMWORD columns: load full vectors
+ alignx 16, 7
+; Otherwise fall through into the partial-row loader (.column_ld1).
+%if RGB_PIXELSIZE == 3 ; ---------------
+; Partial-row loader (3 bytes/pixel): the remaining bytes are gathered from the row tail inward, one chunk per set bit of the byte count.
+.column_ld1:
+ push eax ; eax (GOT address) and edx (outptr2) are borrowed as scratch
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx] ; eax = last byte of the row, zero-extended
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT ; what eax already holds moves above the word that precedes it in memory
+ or eax, edx
+.column_ld4:
+ movd mmA, eax ; mmA = tail bytes gathered so far (stale eax bits only fill lanes past the row end)
+ pop edx ; restore outptr2
+ pop eax ; restore GOT address
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG, mmA ; gathered tail becomes the second input vector
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD ; ecx = one pass, so the loop tail ends at 0
+ jmp short .rgb_ycc_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD ; mov keeps ZF from the test; ecx = one pass, so the loop tail ends at 0
+ jz short .rgb_ycc_cnv
+ movq mmF, mmA ; gathered tail becomes the third input vector
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+; Notation: first digit = byte index within the pixel, second = pixel index (0..7).
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH ; mmH = 0, used to widen bytes to words below
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+; Partial-row loader (4 bytes/pixel): ecx counts pixels; chunks are fetched from the row tail inward, one per set bit of ecx.
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA ; gathered tail becomes the second input vector
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD ; mov keeps ZF from the test; ecx = one pass, so the loop tail ends at 0
+ jz short .rgb_ycc_cnv
+ movq mmD, mmA ; gathered tail becomes the third and fourth input vectors
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+; Notation: first digit = byte index within the pixel, second = pixel index (0..7).
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF ; mmF = 0, used to widen bytes to words below
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+; No zero register is left for the last pair: each byte is placed in the high half of a word, then shifted down.
+ punpcklbw mmF, mmH
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+; NOTE(review): mmA..mmH above are aliases defined outside this file; presumably they map onto mm0-mm7 so the layout stated here holds - confirm in jcolsamp.inc.
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+; Scratch usage: wk(0..3) keep RE/RO/BE/BO; wk(4..5) and wk(6..7) keep the R/G part of Y for odd and even samples.
+ movq MMWORD [wk(0)], mm0 ; wk(0)=RE
+ movq MMWORD [wk(1)], mm1 ; wk(1)=RO
+ movq MMWORD [wk(2)], mm4 ; wk(2)=BE
+ movq MMWORD [wk(3)], mm5 ; wk(3)=BO
+; Odd samples: R/G word pairs feed both the Y partial sum and Cb.
+ movq mm6, mm1
+ punpcklwd mm1, mm3
+ punpckhwd mm6, mm3
+ movq mm7, mm1
+ movq mm4, mm6
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+; Unpacking zero with B puts B in the high word of each dword (B << 16); the shift right by 1 then gives B << 15.
+ pxor mm1, mm1
+ pxor mm6, mm6
+ punpcklwd mm1, mm5 ; mm1=BOL
+ punpckhwd mm6, mm5 ; mm6=BOH
+ psrld mm1, 1 ; mm1=BOL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BOH*FIX(0.500)
+
+ movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm1
+ paddd mm4, mm6
+ paddd mm7, mm5
+ paddd mm4, mm5
+ psrld mm7, SCALEBITS ; mm7=CbOL
+ psrld mm4, SCALEBITS ; mm4=CbOH
+ packssdw mm7, mm4 ; mm7=CbO
+; Even samples: same R/G products, with the Y part kept in wk(6..7).
+ movq mm1, MMWORD [wk(2)] ; mm1=BE
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ movq mm5, mm0
+ movq mm4, mm6
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor mm0, mm0
+ pxor mm6, mm6
+ punpcklwd mm0, mm1 ; mm0=BEL
+ punpckhwd mm6, mm1 ; mm6=BEH
+ psrld mm0, 1 ; mm0=BEL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BEH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm5, mm0
+ paddd mm4, mm6
+ paddd mm5, mm1
+ paddd mm4, mm1
+ psrld mm5, SCALEBITS ; mm5=CbEL
+ psrld mm4, SCALEBITS ; mm4=CbEH
+ packssdw mm5, mm4 ; mm5=CbE
+; Odd results move to the high byte of each word and are ORed with the even results, restoring column order (relies on each result fitting in 8 bits).
+ psllw mm7, BYTE_BIT
+ por mm5, mm7 ; mm5=Cb
+ movq MMWORD [ebx], mm5 ; Save Cb (a full MMWORD even after a partial load - presumably rows are padded; confirm)
+; Odd samples: B/G products complete Y (added to wk(4..5)) and start Cr.
+ movq mm0, MMWORD [wk(3)] ; mm0=BO
+ movq mm6, MMWORD [wk(2)] ; mm6=BE
+ movq mm1, MMWORD [wk(1)] ; mm1=RO
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ movq mm7, mm0
+ movq mm5, mm4
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, MMWORD [wk(4)]
+ paddd mm4, MMWORD [wk(5)]
+ paddd mm0, mm3
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ pxor mm3, mm3
+ pxor mm4, mm4
+ punpcklwd mm3, mm1 ; mm3=ROL
+ punpckhwd mm4, mm1 ; mm4=ROH
+ psrld mm3, 1 ; mm3=ROL*FIX(0.500)
+ psrld mm4, 1 ; mm4=ROH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm3
+ paddd mm5, mm4
+ paddd mm7, mm1
+ paddd mm5, mm1
+ psrld mm7, SCALEBITS ; mm7=CrOL
+ psrld mm5, SCALEBITS ; mm5=CrOH
+ packssdw mm7, mm5 ; mm7=CrO
+; Even samples: B/G products complete Y (added to wk(6..7)) and start Cr.
+ movq mm3, MMWORD [wk(0)] ; mm3=RE
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ movq mm1, mm6
+ movq mm5, mm4
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(6)]
+ paddd mm4, MMWORD [wk(7)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ pxor mm2, mm2
+ pxor mm4, mm4
+ punpcklwd mm2, mm3 ; mm2=REL
+ punpckhwd mm4, mm3 ; mm4=REH
+ psrld mm2, 1 ; mm2=REL*FIX(0.500)
+ psrld mm4, 1 ; mm4=REH*FIX(0.500)
+
+ movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+ paddd mm1, mm2
+ paddd mm5, mm4
+ paddd mm1, mm0
+ paddd mm5, mm0
+ psrld mm1, SCALEBITS ; mm1=CrEL
+ psrld mm5, SCALEBITS ; mm5=CrEH
+ packssdw mm1, mm5 ; mm1=CrE
+
+ psllw mm7, BYTE_BIT
+ por mm1, mm7 ; mm1=Cr
+ movq MMWORD [edx], mm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_MMWORD ; one MMWORD worth of columns has been converted
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ add ebx, byte SIZEOF_MMWORD ; outptr1
+ add edx, byte SIZEOF_MMWORD ; outptr2
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop ; another full vector of columns remains
+ test ecx, ecx
+ jnz near .column_ld1 ; a short remainder is finished by the partial-row loader
+; Row done: restore the per-row registers saved at .rowloop (reverse push order).
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+; Step every row-pointer array to its next entry.
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+; The early exits jump straight to .return and skip emms; no MMX instruction has executed on those paths.
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0 ; slot at [aligned ebp] holding the saved frame base
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = frame base (esp after push ebp); arguments are read relative to it
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; keep the frame base in the aligned slot (original_ebp)
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the wk[] work area below the aligned ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx ; img_width == 0?
+ jz near .return ; yes: nothing to convert
+
+ push ecx ; save img_width while ecx holds output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return ; num_rows <= 0: nothing to convert
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; at least SIZEOF_XMMWORD columns in this row?
+ jae near .columnloop ; yes: use the full-width loads
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax ; eax (GOT address) is borrowed as scratch below
+ push edx ; edx (outptr2) is borrowed as scratch below
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; 1-byte piece in the remaining byte count?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD ; 2-byte piece in the remaining byte count?
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax ; xmmA = bytes gathered in eax so far
+ pop edx ; restore outptr2
+ pop eax ; restore GOT address
+ test cl, SIZEOF_DWORD ; 4-byte piece in the remaining byte count?
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD ; shift earlier pieces up to make room
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD ; 8-byte piece in the remaining byte count?
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD ; shift earlier pieces up to make room
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD ; count the tail as one full block so the loop ends after it
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD ; mov leaves the flags from the test above intact
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH ; xmmH = 0, used to zero-extend bytes to words
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16 ; ecx is a column count here (not scaled to bytes)
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD ; shift the earlier piece up to make room
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD ; mov leaves the flags from the test above intact
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF ; xmmF = 0, used to zero-extend bytes to words
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH ; xmmF is zero here, so xmmH bytes land in the high byte of each word
+ punpckhbw xmmH, xmmH ; each byte duplicated into both halves of a word
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3 ; xmm1=(RO,GO) word pairs, low half
+ punpckhwd xmm6, xmm3 ; xmm6=(RO,GO) word pairs, high half
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2 ; xmm0=(RE,GE) word pairs, low half
+ punpckhwd xmm6, xmm2 ; xmm6=(RE,GE) word pairs, high half
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT ; odd-column Cb into the high byte of each word
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 ; xmm0=(BO,GO) word pairs, low half
+ punpckhwd xmm4, xmm3 ; xmm4=(BO,GO) word pairs, high half
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2 ; xmm6=(BE,GE) word pairs, low half
+ punpckhwd xmm4, xmm2 ; xmm4=(BE,GE) word pairs, high half
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT ; odd-column Y into the high byte of each word
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT ; odd-column Cr into the high byte of each word
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_XMMWORD ; one block of SIZEOF_XMMWORD columns done
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; another full block remains
+ test ecx, ecx
+ jnz near .column_ld1 ; partial block remains: use the piecewise loads
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; next row pointer, output component 0
+ add ebx, byte SIZEOF_JSAMPROW ; next row pointer, output component 1
+ add edx, byte SIZEOF_JSAMPROW ; next row pointer, output component 2
+ dec eax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- frame base saved at [aligned ebp]
+ pop ebp ; restore caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; 8 word pairs (32 bytes): 0.299 / 0.337 terms of Y
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; 8 word pairs (32 bytes): 0.114 / 0.250 terms of Y
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 ; 8 word pairs (32 bytes): -0.168 / -0.331 terms of Cb
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 ; 8 word pairs (32 bytes): -0.081 / -0.418 terms of Cr
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS) ; (one-half - 1) plus CENTERJSAMPLE, both in SCALEBITS fixed point
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; one-half in SCALEBITS fixed point
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 ; function name for the EXT_RGB instance included next
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 ; function name for the EXT_RGBX instance included next
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 ; function name for the EXT_BGR instance included next
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 ; function name for the EXT_BGRX instance included next
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 ; function name for the EXT_XBGR instance included next
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 ; function name for the EXT_XRGB instance included next
+%include "jccolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337 ; 2 word pairs (8 bytes): 0.299 / 0.337 terms of Y
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250 ; 2 word pairs (8 bytes): 0.114 / 0.250 terms of Y
+PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331 ; 2 word pairs (8 bytes): -0.168 / -0.331 terms of Cb
+PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418 ; 2 word pairs (8 bytes): -0.081 / -0.418 terms of Cr
+PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS) ; (one-half - 1) plus CENTERJSAMPLE, both in SCALEBITS fixed point
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) ; one-half in SCALEBITS fixed point, added before the shift to round
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx ; function name for the EXT_RGB instance included next
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx ; function name for the EXT_RGBX instance included next
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx ; function name for the EXT_BGR instance included next
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx ; function name for the EXT_BGRX instance included next
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx ; function name for the EXT_XBGR instance included next
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx ; function name for the EXT_XRGB instance included next
+%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 4 word pairs (16 bytes): 0.299 / 0.337 terms of Y
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 4 word pairs (16 bytes): 0.114 / 0.250 terms of Y
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 ; 4 word pairs (16 bytes): -0.168 / -0.331 terms of Cb
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 ; 4 word pairs (16 bytes): -0.081 / -0.418 terms of Cr
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS) ; (one-half - 1) plus CENTERJSAMPLE, both in SCALEBITS fixed point
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; one-half in SCALEBITS fixed point, added before the shift to round
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 ; function name for the EXT_RGB instance included next
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 ; function name for the EXT_RGBX instance included next
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 ; function name for the EXT_BGR instance included next
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 ; function name for the EXT_BGRX instance included next
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 ; function name for the EXT_XBGR instance included next
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 ; function name for the EXT_XRGB instance included next
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; 8 word pairs (32 bytes): FIX(0.299), FIX(0.337)
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; 8 word pairs (32 bytes): FIX(0.114), FIX(0.250)
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; one-half in SCALEBITS fixed point
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 ; function name for the EXT_RGB instance included next
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 ; function name for the EXT_RGBX instance included next
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 ; function name for the EXT_BGR instance included next
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 ; function name for the EXT_BGRX instance included next
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 ; function name for the EXT_XBGR instance included next
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 ; function name for the EXT_XRGB instance included next
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray-mmx.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fractional bits of the fixed-point weights: FIX(x) = x * 2^16, rounded
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700) - exceeds 32767, so it cannot be used directly as a signed word for pmaddwd
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086; the G weight is applied as 0.337 + 0.250
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337 ; 4 words = one mmword of (R weight, G weight) pairs for pmaddwd
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250 ; 4 words = one mmword of (B weight, remaining G weight) pairs
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) ; 0.5 in fixed point; added before the SCALEBITS right shift to round
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-mmx.asm" ; first instantiation: emits jsimd_rgb_gray_convert_mmx with the RGB_* values already in effect (presumably the jcolsamp.inc defaults - not visible here)
+
+%undef RGB_RED ; each group below re-instantiates the template for one extended pixel layout
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE ; selects the template's "%if RGB_PIXELSIZE == 3" or "%else" path
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx ; symbol name the next %include emits
+%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray-sse2.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fractional bits of the fixed-point weights: FIX(x) = x * 2^16, rounded
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700) - exceeds 32767, the largest positive signed 16-bit word
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086; the G weight is split as 0.337 + 0.250
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 8 words = one xmmword of (R weight, G weight) pairs; presumably pmaddwd operands in jcgryext-sse2.asm, as in the MMX/AVX2 siblings
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 8 words = one xmmword of (B weight, remaining G weight) pairs
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; 0.5 in fixed point, one copy per dword lane (rounding term)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-sse2.asm" ; first instantiation: emits jsimd_rgb_gray_convert_sse2 with the RGB_* values already in effect (presumably the jcolsamp.inc defaults - not visible here)
+
+%undef RGB_RED ; each group below re-instantiates the template for one extended pixel layout
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 ; symbol name the next %include emits
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext-avx2.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert num_rows rows of packed 3- or 4-byte pixels to 8-bit luminance (Y).
+; Only plane 0 of output_buf is written, starting at row output_row.
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+; All arguments are read from the stack; ebx, esi, edi and ebp are preserved.
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width (b = unaligned frame pointer: +0 saved ebp, +4 return address)
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0 ; [ebp] holds the unaligned frame pointer stored by the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp (unaligned frame pointer; [eax] = caller's ebp)
+ sub esp, byte 4 ; reserve a dword for the saved frame pointer
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax ; saved so that "pop esp" in the epilogue can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM aligned ymmwords of scratch below ebp
+ pushpic eax ; make room for the GOT address (the gotptr slot)
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; ecx = img_width
+ test ecx, ecx
+ jz near .return ; zero-width image: nothing to convert
+
+ push ecx ; keep img_width while ecx is borrowed for output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)] ; esi = output_buf
+ mov ecx, JDIMENSION [output_row(eax)] ; ecx = output_row
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] (plane 0)
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)] ; esi = input_buf
+ mov eax, INT [num_rows(eax)] ; eax = num_rows (argument pointer no longer needed)
+ test eax, eax
+ jle near .return ; signed test: zero or negative row count does nothing
+ alignx 16, 7
+.rowloop: ; per row: esi -> input row pointer, edi -> output row pointer, ecx = img_width, eax = rows left
+ pushpic eax ; save the row counter; pushpic/movpic/poppic come from jsimdext.inc (presumably active only in PIC builds)
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop ; at least one full group of SIZEOF_YMMWORD pixels
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; partial group (fewer than SIZEOF_YMMWORD pixels): gather the bytes piecewise, last bytes first
+ push eax ; eax/edx are scratch for the 1- and 2-byte pieces
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; odd byte count?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx] ; eax = last byte
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT ; earlier-gathered byte moves above the new word
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax ; start the vector accumulator with the 0-3 bytes gathered in eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD ; make room below for the dword that precedes the gathered bytes
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx] ; 16-byte load into the low lane of ymmB
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap 128-bit lanes: bytes gathered so far move to the upper lane
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD ; byte count is at most 3*31 = 93, so the 32- and 64-byte pieces never both occur
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second input vector
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD ; mov leaves the flags from test intact; ecx = one full group so the loop tail ends the row
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA ; gathered tail becomes the third input vector
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full group: 3 ymmwords = SIZEOF_YMMWORD pixels of 3 bytes
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; partial group: ecx counts pixels here (4 bytes each), gathered last pixels first
+ test cl, SIZEOF_XMMWORD/16 ; 1 leftover pixel?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8 ; 2 pixels?
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD ; make room below for the two pixels that precede the gathered one
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4 ; 4 pixels (one xmmword)?
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1 ; swap lanes: pixels gathered so far move to the upper 128 bits
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2 ; 8 pixels (one ymmword)?
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second input vector
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; 16 pixels (two ymmwords)?
+ mov ecx, SIZEOF_YMMWORD ; mov leaves the flags from test intact; ecx = one full group so the loop tail ends the row
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA ; gathered tail becomes the third and fourth input vectors
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full group: 4 ymmwords = SIZEOF_YMMWORD pixels of 4 bytes
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH ; bytes land in the high half of each word (ymmF is zero here) ...
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ vmovdqa ymm6, ymm1 ; NOTE(review): ymmA..ymmH above are presumably aliases of ymm0..ymm7 chosen by jcolsamp.inc from RGB_RED/GREEN/BLUE - confirm there
+ vpunpcklwd ymm1, ymm1, ymm3 ; interleave R and G words so pmaddwd forms R*w1 + G*w2 per dword
+ vpunpckhwd ymm6, ymm6, ymm3
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1 ; sum both partial products of the odd pixels
+ vpaddd ymm4, ymm4, ymm7
+ vpaddd ymm0, ymm0, ymm3 ; add 0.5 so the shift below rounds
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT ; odd-pixel Y moves to the high byte of each word
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y (always a full ymmword, even for a partial group - NOTE(review): relies on the output row having room past img_width; confirm with the caller)
+
+ sub ecx, byte SIZEOF_YMMWORD ; one group of pixels consumed
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1 ; leftover pixels: gather them piecewise
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax ; restore the row counter saved at .rowloop
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; next output row pointer
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return: ; also reached by the early exits, with the same stack layout as here
+ vzeroupper ; clear the upper ymm halves before returning to the caller
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext-mmx.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert num_rows rows of packed 3- or 4-byte pixels to 8-bit luminance (Y).
+; Only plane 0 of output_buf is written, starting at row output_row.
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+; All arguments are read from the stack; ebx, esi, edi and ebp are preserved.
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width (b = unaligned frame pointer: +0 saved ebp, +4 return address)
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0 ; [ebp] holds the unaligned frame pointer stored by the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp (unaligned frame pointer; [eax] = caller's ebp)
+ sub esp, byte 4 ; reserve a dword for the saved frame pointer
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; saved so that "pop esp" in the epilogue can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM aligned mmwords of scratch below ebp
+ pushpic eax ; make room for the GOT address (the gotptr slot)
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return ; zero-width image: nothing to convert (no MMX instruction executed yet)
+
+ push ecx ; keep img_width while ecx is borrowed for output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)] ; esi = output_buf
+ mov ecx, JDIMENSION [output_row(eax)] ; ecx = output_row
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = output_buf[0] (plane 0)
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)] ; esi = input_buf
+ mov eax, INT [num_rows(eax)] ; eax = num_rows (argument pointer no longer needed)
+ test eax, eax
+ jle near .return ; signed test: zero or negative row count does nothing
+ alignx 16, 7
+.rowloop: ; per row: esi -> input row pointer, edi -> output row pointer, ecx = img_width, eax = rows left
+ pushpic eax ; save the row counter; pushpic/movpic/poppic come from jsimdext.inc (presumably active only in PIC builds)
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop ; at least one full group of SIZEOF_MMWORD pixels
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; partial group (fewer than SIZEOF_MMWORD pixels): gather the bytes piecewise, last bytes first
+ push eax ; eax/edx are scratch for the 1- and 2-byte pieces
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; odd byte count?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx] ; eax = last byte, zero-extended
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT ; earlier-gathered byte moves above the new word
+ or eax, edx
+.column_ld4:
+ movd mmA, eax ; start the vector accumulator with the 0-3 bytes gathered in eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT ; make room below for the dword that precedes the gathered bytes
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD ; byte count is at most 3*7 = 21, so the 8- and 16-byte pieces never both occur
+ jz short .column_ld16
+ movq mmG, mmA ; gathered tail becomes the second input vector
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD ; ecx = one full group so the loop tail ends the row
+ jmp short .rgb_gray_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD ; mov leaves the flags from test intact
+ jz short .rgb_gray_cnv
+ movq mmF, mmA ; gathered tail becomes the third input vector
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full group: 3 mmwords = SIZEOF_MMWORD pixels of 3 bytes
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; partial group: ecx counts pixels here (4 bytes each), gathered last pixels first
+ test cl, SIZEOF_MMWORD/8 ; 1 leftover pixel?
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4 ; 2 pixels (one mmword)?
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA ; gathered tail becomes the second input vector
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2 ; 4 pixels (two mmwords)?
+ mov ecx, SIZEOF_MMWORD ; mov leaves the flags from test intact; ecx = one full group so the loop tail ends the row
+ jz short .rgb_gray_cnv
+ movq mmD, mmA ; gathered tail becomes the third and fourth input vectors
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full group: 4 mmwords = SIZEOF_MMWORD pixels of 4 bytes
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF, mmH ; bytes land in the high half of each word (mmF is zero here) ...
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movq mm6, mm1 ; NOTE(review): mmA..mmH above are presumably aliases of mm0..mm7 chosen by jcolsamp.inc from RGB_RED/GREEN/BLUE - confirm there
+ punpcklwd mm1, mm3 ; interleave R and G words so pmaddwd forms R*w1 + G*w2 per dword
+ punpckhwd mm6, mm3
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq mm0, mm5 ; mm0=BO
+ movq mm6, mm4 ; mm6=BE
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, mm1 ; sum both partial products of the odd pixels
+ paddd mm4, mm7
+ paddd mm0, mm3 ; add 0.5 so the shift below rounds
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(0)]
+ paddd mm4, MMWORD [wk(1)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT ; odd-pixel Y moves to the high byte of each word
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y (always a full mmword, even for a partial group - NOTE(review): relies on the output row having room past img_width; confirm with the caller)
+
+ sub ecx, byte SIZEOF_MMWORD ; one group of pixels consumed
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1 ; leftover pixels: gather them piecewise
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax ; restore the row counter saved at .rowloop
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; next output row pointer
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state (the early exits jump past this, before any MMX instruction has run)
+
+.return: ; also reached by the early exits, with the same stack layout as here
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext-sse2.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+ ; (b) is the unaligned frame base: the stack address holding the caller's saved ebp, so the first argument sits at (b) + 8.
+%define original_ebp ebp + 0 ; slot at the aligned ebp that holds the unaligned frame base
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; number of XMMWORD scratch slots below the aligned ebp
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+ ; Converts num_rows rows of interleaved RGB pixels into Y samples written to output_buf[0], one XMMWORD of output per column-loop pass.
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = unaligned frame base (points at the saved ebp)
+ sub esp, byte 4 ; leave space for saving the frame base
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; save the frame base so the epilogue can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; allocate wk[WK_NUM] below the aligned ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return ; nothing to do for a zero-width image
+
+ push ecx ; ecx is borrowed for output_row below
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return ; nothing to do for num_rows <= 0
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; at least one full XMMWORD of columns: take the full-width path
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; partial-width tail: gather the remaining ecx*3 input bytes piecewise
+ push eax ; eax (GOT address) and edx are used as scratch below
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD ; treat the tail as one full pass so the column loop ends after it
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD ; treat the tail as one full pass (mov leaves the flags from test intact)
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full-width path: load three XMMWORDs of interleaved input (unaligned loads)
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH ; xmmH=0, used to zero-extend bytes to words below
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; partial-width tail: gather the remaining ecx pixels piecewise
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD ; treat the tail as one full pass (mov leaves the flags from test intact)
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop: ; full-width path: load four XMMWORDs of interleaved input (unaligned loads)
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF ; xmmF=0, used to zero-extend bytes to words below
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH ; bytes land in the high half of each word (xmmF was zero)
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3 ; add the rounding constant before the shift
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2 ; add the rounding constant before the shift
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT ; move odd-column Y into the high byte of each word
+ por xmm6, xmm0 ; xmm6=Y (even columns in low bytes, odd columns in high bytes)
+ movdqa XMMWORD [edi], xmm6 ; Save Y (aligned store: outptr0 must be XMMWORD-aligned)
+
+ sub ecx, byte SIZEOF_XMMWORD ; columns still to convert
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1 ; 1..SIZEOF_XMMWORD-1 columns left: take the partial-width path
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW ; advance to the next output row pointer
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- unaligned frame base saved in the prologue
+ pop ebp ; restore the caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/media/libjpeg/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:    resp 1 ; pointer to the next byte to write in the output buffer
+.free_in_buffer:      resp 1 ; number of byte slots still free in the output buffer
+.cur.put_buffer.simd: resq 1 ; 64-bit bit accumulation buffer
+.cur.free_bits:       resd 1 ; number of bits still available in put_buffer
+.cur.last_dc_val:     resd 4 ; last DC coefficient, one entry per component
+.cinfo:               resp 1 ; kept because dump_buffer needs access to it
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol (one dword per symbol)
+.ehufsi: resb 256 ; length of code for each symbol (one byte per symbol)
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+ alignz 32
+ ; jpeg_mask_bits[i] = (1 << i) - 1 for i = 0..15, stored as one qword per entry.
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+ dq 0x000f, 0x001f, 0x003f, 0x007f
+ dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+ ; Mirrored bit-length table: byte [jpeg_nbits_table + i] = bit length of i for 0 <= i < 32768, and byte [jpeg_nbits_table - k] = bit length of k - 1 for 1 <= k <= 32768.
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+ alignz 32
+ ; NBITS(x) addresses the bit-length table at byte offset x (via the nbits_base register under PIC); MASK_BITS(x) addresses qword x of jpeg_mask_bits relative to the same base.
+%ifdef PIC
+%define NBITS(x) nbits_base + x
+%else
+%define NBITS(x) jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%define mm_put_buffer mm0 ; bit accumulation buffer
+%define mm_all_0xff mm1 ; all bytes 0xFF, compared against to find 0xFF output bytes
+%define mm_temp mm2 ; scratch
+%define mm_nbits mm3 ; shift count; shares mm3 with mm_code_bits
+%define mm_code_bits mm3 ; shares mm3 with mm_nbits
+%define mm_code mm4 ; code bits to emit
+%define mm_overflow_bits mm5 ; shift count used by EMIT_QWORD
+%define mm_save_nbits mm6 ; holds nbits_base while EMIT_QWORD uses it as a temp register
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values, or
+; mmN treated as eight unsigned 8-bit values
+; bN[i]: perform the same operation on all unsigned 8-bit values,
+; i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp %1
+%define %%tempb %2
+%define %%temph %3
+ add nbits, free_bits ; nbits += free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ movq mm_temp, mm_code ; temp = code;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
+ add free_bits, 64 ; free_bits += 64;
+ por mm_temp, mm_put_buffer ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+ movd mm_save_nbits, nbits_base ; save nbits_base (it doubles as the temp register here)
+%endif
+ movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
+ movq mm_put_buffer, mm_code ; put_buffer = code;
+ pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+ movq mm_code, mm_code_bits ; code = code_bits;
+ psrlq mm_code_bits, 32 ; code_bits >>= 32;
+ pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
+ movd %%temp, mm_code_bits ; temp = code_bits;
+ bswap %%temp ; temp = htonl(temp);
+ test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
+ jnz %%.SLOW ; goto %%.SLOW
+ mov dword [buffer], %%temp ; *(uint32_t *)buffer = temp;
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ %4
+ movd nbits, mm_code ; nbits = (uint32_t)(code);
+ %5
+ bswap nbits ; nbits = htonl(nbits);
+ mov dword [buffer + 4], nbits ; *(uint32_t *)(buffer + 4) = nbits;
+ lea buffer, [buffer + 8] ; buffer += 8;
+ %6
+ %7
+ %8
+ jmp %9 ; return
+%%.SLOW:
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr %%temp, 16 ; temp >>= 16;
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ bswap nbits ; nbits = htonl(nbits)
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ shr nbits, 16 ; nbits >>= 16;
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ %4
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ %5
+ %6
+ %7
+ %8
+ jmp %9 ; return;
+%endmacro
+
+%macro PUSH 1
+ push %1 ; push the operand onto the stack
+%assign stack_offset stack_offset + 4 ; track the 4 bytes pushed in the assemble-time stack_offset counter
+%endmacro
+
+%macro POP 1
+ pop %1 ; pop the top of the stack into the operand
+%assign stack_offset stack_offset - 4 ; track the 4 bytes released in the assemble-time stack_offset counter
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register. Equivalent to
+; get_GOT %1
+; lea %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+ call %%.geteip ; pushes the address of %%.ref as the return address
+%%.ref:
+ %4
+ add %1, %2 - %%.ref ; %1 = address of %%.ref + (%2 - %%.ref) = address of %2
+ jmp short %%.done
+ align 32
+%%.geteip:
+ %3 4 ; must adjust stack pointer because of call
+ mov %1, POINTER [esp] ; %1 = return address, i.e. the address of %%.ref
+ ret
+ align 32
+%%.done:
+%else
+ %3 0 ; no call was made, so no stack pointer adjustment
+ %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame eax
+%ifdef PIC
+%define nbits_base ebx
+%endif
+%define emit_temp ebx
+%define emit_tempb bl
+%define emit_temph bh
+%define dctbl ecx
+%define block edx
+%define code_temp esi
+%define index_temp edi
+%define t esp
+%define index ebp
+
+%assign save_frame DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset 0
+%define arg_state 4 + stack_offset
+%define arg_buffer 8 + stack_offset
+%define arg_block 12 + stack_offset
+%define arg_last_dc_val 16 + stack_offset
+%define arg_dctbl 20 + stack_offset
+%define arg_actbl 24 + stack_offset
+
+ ;X: X = code stream
+ mov block, [esp + arg_block]
+ PUSH ebx
+ PUSH ebp
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ PUSH esi
+ PUSH edi
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ mov frame, esp
+ lea t, [frame - (save_frame + 4)]
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
+ mov [t + save_frame], frame
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movsx code_temp, word [block] ;Z: code_temp = block[0];
+
+; %1 - stack pointer adjustment (bytes added to t-relative offsets; GET_SYM passes 4 when invoked after its call, 0 otherwise)
+%macro GET_SYM_BEFORE 1
+ movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+ ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+ sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
+ pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl index_temp, 16 ;Z: index_temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ or index, index_temp ;Z: index |= index_temp; (bit i set where element i of rows A-D is zero)
+%undef index_temp
+%define free_bits edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ not index ;Z: index = ~index; (bit i now set where element i is nonzero)
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ mov dctbl, [frame + arg_dctbl] ;Z: dctbl = the dctbl argument
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
+%endmacro
+
+ GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
+ ;Z: i.e. if code_temp is positive
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ movd mm_temp, code_temp ;Z: temp = code_temp
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define nbits edx
+%define nbitsb dl
+%define nbitsh dh
+ movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state esi
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ mov state, [frame + arg_state]
+ movd mm_nbits, nbits ;Z: nbits --> MMX register
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+ ;Z: code = dctbl->ehufco[nbits];
+%define size ecx
+%define sizeb cl
+%define sizeh ch
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+ ;Z: size = dctbl->ehufsi[nbits];
+%undef dctbl
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ mov free_bits, [state + working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+%undef state
+%define actbl esi
+ mov actbl, [frame + arg_actbl]
+%define buffer eax
+ mov buffer, [frame + arg_buffer]
+%undef frame
+ jmp .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+; size <= 32, so this is not really a loop
+.BRLOOP1: ; .BRLOOP1:
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ and index, 0x7ffffff ; clear index if size == 32
+ sub size, 16 ; size -= 16;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ jmp .ERLOOP1 ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+%ifdef PIC
+ times 6 nop
+%else
+ times 2 nop
+%endif
+.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged. Correct in .BRLOOP.
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP1 ; goto .BRLOOP1;
+.ERLOOP1: ; .ERLOOP1:
+ movsx nbits, word [t] ; nbits = *t;
+%ifdef PIC
+ add size, size ; size += size;
+%else
+ lea size, [size * 2] ; size += size;
+%endif
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+.BEGIN: ; .BEGIN:
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ add nbits, size ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP1 ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1: ; .ELOOP1:
+ pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
+ pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
+ shl size, 16 ; size <<= 16;
+ or index, size ; index |= size;
+ not index ; index = ~index;
+ lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+ ; nbits = t + 1 + 64;
+ and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
+ sub nbits, t ; nbits -= t;
+ shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ inc size ; ++size;
+ test index, index ; if (index == 0)
+ jz .ELOOP2 ; goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+ shr index, sizeb ; index >>= size;
+ lea size, [size + nbits - 33] ; size = size + nbits - 33;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+.BRLOOP2: ; do {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ sub size, 16 ; size -= 16;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
+ movd mm_nbits, nbits ; else { nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+ jmp .BRLOOP2 ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP2 ; goto .BRLOOP2;
+.ERLOOP2: ; .ERLOOP2:
+ movsx nbits, word [t] ; nbits = *t;
+ add size, size ; size += size;
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ movd mm_nbits, nbits ; nbits --> MMX register
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ lea nbits, [nbits + size] ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP2 ; } while (index != 0);
+.ELOOP2: ; .ELOOP2:
+ mov nbits, t ; nbits = t;
+ lea t, [t + SIZEOF_WORD] ; t = &t[1];
+ and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
+ cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
+ je .EFN ; {
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+ ; code = actbl->ehufco[0];
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+.EFN: ; } }
+%define frame esp
+ mov frame, [t + save_frame]
+%define state ecx
+ mov state, [frame + arg_state]
+ movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ emms
+ mov [state + working_state.cur.free_bits], free_bits
+ ; state->cur.free_bits = free_bits;
+ POP edi
+ POP esi
+ POP ebp
+ POP ebx
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP1:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
+ .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP1:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP1 }, \
+ .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP2:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
+ { cmp size, 16 }, \
+ { jle .ERLOOP2 }, \
+ .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP2:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP2 }, \
+ .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+ ; Gather 16 coefficients through the order table:
+ ; X0 lane i <- BLOCK[LUT[i]], X1 lane i <- BLOCK[LUT[i + 8]], i = 0..7.
+ ; N0 and N1 are cleared so the caller can build sign masks with pcmpgtw.
+ ; Clobbers T0 and T1.
+ pxor N0, N0
+ pxor N1, N1
+%assign LOAD_LANE 0
+%rep 8
+
+ mov T0, INT [LUT + LOAD_LANE*SIZEOF_INT]
+ mov T1, INT [LUT + (LOAD_LANE+8)*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], LOAD_LANE
+ pinsrw X1, word [BLOCK + T1 * 2], LOAD_LANE
+%assign LOAD_LANE LOAD_LANE+1
+%endrep
+%undef LOAD_LANE
+%endmacro
+
+%macro LOAD15 0
+ ; Tail gather used when 9..15 coefficients remain (LENEND = count & 7):
+ ; X0 lane i <- BLOCK[LUT[i]] for i = 0..7 (always),
+ ; X1 lane 0 <- BLOCK[LUT[8]] (always),
+ ; X1 lane i <- BLOCK[LUT[i + 8]] only while i < LENEND (i = 1..6);
+ ; the remaining X1 lanes stay zero. N0 and N1 are cleared.
+ ; Clobbers T0, T1 and the flags.
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ ; X0 lanes 1..7: unconditional.
+%assign LOAD_LANE 1
+%rep 7
+ mov T0, INT [LUT + LOAD_LANE*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], LOAD_LANE
+%assign LOAD_LANE LOAD_LANE+1
+%endrep
+
+ ; X1 lanes 1..6: stop at the first lane that is >= LENEND.
+%assign LOAD_LANE 1
+%rep 6
+ cmp LENEND, LOAD_LANE+1
+ jl %%.ELOAD15
+ mov T1, INT [LUT + (LOAD_LANE+8)*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], LOAD_LANE
+%assign LOAD_LANE LOAD_LANE+1
+%endrep
+%%.ELOAD15:
+%undef LOAD_LANE
+%endmacro
+
+%macro LOAD8 0
+ ; Gather exactly 8 coefficients: X0 lane i <- BLOCK[LUT[i]], i = 0..7.
+ ; N0 is cleared for the caller's sign-mask computation. Clobbers T0.
+ pxor N0, N0
+%assign LOAD_LANE 0
+%rep 8
+
+ mov T0, INT [LUT + LOAD_LANE*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], LOAD_LANE
+%assign LOAD_LANE LOAD_LANE+1
+%endrep
+%undef LOAD_LANE
+%endmacro
+
+%macro LOAD7 0
+ ; Tail gather for fewer than 8 remaining coefficients (LENEND = count & 7):
+ ; X0 lane 0 <- BLOCK[LUT[0]] (always),
+ ; X0 lane i <- BLOCK[LUT[i]] only while i < LENEND (i = 1..6);
+ ; the remaining lanes stay zero. N0 is cleared.
+ ; Clobbers T1 and the flags.
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+%assign LOAD_LANE 1
+%rep 6
+
+ cmp LENEND, LOAD_LANE+1
+ jl %%.ELOAD7
+ mov T1, INT [LUT + LOAD_LANE*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], LOAD_LANE
+%assign LOAD_LANE LOAD_LANE+1
+%endrep
+%%.ELOAD7:
+%undef LOAD_LANE
+%endmacro
+
+%macro REDUCE0 0
+ ; Build a 64-bit bitmap of the 64 words at VALUES (bit k set <=> word k is
+ ; nonzero) and store it as two INTs through the pointer held in ZEROBITS.
+ ; Expects ZERO (xmm7) to be all zeros on entry; xmm7 is overwritten.
+ ; Clobbers eax, ecx, edx, esi, edi, xmm0-xmm7 and the flags.
+
+ ; Words 0..31 -> eax
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ pcmpeqw xmm0, ZERO ; word = (word == 0 ? -1 : 0)
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ packsswb xmm0, xmm1 ; one byte per word, words 0..15
+ packsswb xmm2, xmm3 ; one byte per word, words 16..31
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ shl ecx, 16
+ or eax, ecx ; eax = "is zero" bits for words 0..31
+ not eax ; flip to "is nonzero"
+
+ ; Words 32..63 -> edx
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] ; last use of ZERO: compare in place
+ packsswb xmm4, xmm5 ; one byte per word, words 32..47
+ packsswb xmm6, xmm7 ; one byte per word, words 48..63
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+ shl esi, 16
+ or edx, esi ; edx = "is zero" bits for words 32..63
+ not edx ; flip to "is nonzero"
+
+ mov edi, ZEROBITS ; edi = destination pointer (VALUES is dead now)
+ mov INT [edi], eax
+ mov INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+; Stores abs(coef) >> Al in values[k]; values[k + DCTSIZE2] gets the same value, bit-inverted if coef < 0.
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+; zerobits[0..1] receive one bit per word of values[0..DCTSIZE2-1]; a set bit marks a nonzero word.
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
+
+%define ZERO xmm7 ; all-zero vector (overwritten by REDUCE0)
+%define X0 xmm0 ; coefficients k .. k+7
+%define X1 xmm1 ; coefficients k+8 .. k+15
+%define N0 xmm2 ; sign mask for X0, later the sign-folded value
+%define N1 xmm3 ; sign mask for X1, later the sign-folded value
+%define AL xmm4 ; shift count Al
+%define K eax ; loop counter (same register as LENEND)
+%define LENEND eax ; Sl & 7, read by LOAD15 / LOAD7
+%define LUT ebx ; cursor into jpeg_natural_order_start
+%define T0 ecx ; scratch index
+%define T1 edx ; scratch index
+%define BLOCK esi ; block
+%define VALUES edi ; cursor into values
+%define LEN ebp ; Sl
+
+%define ZEROBITS INT [esp + 5 * 4] ; stack slot just above the five pushed registers
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = esp after the push ("original ebp"); arguments are at eax + 8 and up
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; keep it for the "pop esp" in the epilogue
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 4 ; reserve the ZEROBITS slot
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp ; aligned ebp is saved because ebp is reused as LEN
+
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24] ; written with movdqa below, so it must be 16-byte aligned
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov ZEROBITS, T0 ; stash the zerobits pointer for REDUCE0
+ mov LEN, INT [eax + 16] ; last read through eax; eax becomes K next
+ pxor ZERO, ZERO
+ mov K, LEN
+ and K, -16
+ shr K, 4 ; K = number of full groups of 16
+ jz .ELOOP16 ; Sl < 16: only a tail to process
+.BLOOP16:
+ LOAD16
+ pcmpgtw N0, X0 ; N0[i] = (X0[i] < 0 ? -1 : 0)
+ pcmpgtw N1, X1
+ paddw X0, N0 ; X0[i] += N0[i]
+ paddw X1, N1
+ pxor X0, N0 ; X0[i] ^= N0[i] -> abs(X0[i])
+ pxor X1, N1
+ psrlw X0, AL ; X0[i] >>= Al
+ psrlw X1, AL
+ pxor N0, X0 ; N0[i] = X0[i], bit-inverted if the input was negative
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0 ; values[k + i] = X0[i]
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 ; values[k + i + DCTSIZE2] = N0[i]
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ test LEN, 15 ; Sl is a multiple of 16: no tail
+ je .PADDING
+.ELOOP16: ; tail: Sl & 15 coefficients remain
+ mov LENEND, LEN
+ and LENEND, 7
+
+ test LEN, 8
+ jz .TRY7 ; bit 3 clear: fewer than 8 remain
+ test LEN, 7
+ jz .TRY8 ; exactly 8 remain
+; 9..15 remain: same arithmetic as the main loop, on a partially filled X1
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8: ; exactly 8 remain
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7: ; NOTE(review): LOAD7 always reads lane 0, and Sl == 0 also lands here; presumably Sl >= 1 - confirm
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING: ; zero-fill the first half of values up to DCTSIZE2 words
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3 ; K = number of 8-word groups written so far
+ sub K, DCTSIZE2/8 ; K = -(groups still to clear)
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K ; counts up towards zero
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2 ; rewind VALUES by DCTSIZE2 words for REDUCE0
+
+ REDUCE0
+
+ pop ebp ; ebp = aligned ebp again
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+; Stores abs(coef) >> Al in absvalues[k]; returns in eax the index of the last entry equal to 1 (0 if none).
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+; bits[0..1] = nonzero bitmap of absvalues[0..DCTSIZE2-1]; bits[2..3] = sign bits, a cleared bit marking a negative coefficient.
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *absvalues
+; eax + 28 = size_t *bits
+
+%define ZERO xmm7 ; all-zero vector (overwritten by REDUCE0)
+%define ONE xmm5 ; every word = 1
+%define X0 xmm0 ; coefficients k .. k+7
+%define X1 xmm1 ; coefficients k+8 .. k+15
+%define N0 xmm2 ; sign mask for X0
+%define N1 xmm3 ; sign mask for X1
+%define AL xmm4 ; shift count Al
+%define K eax ; loop counter (same register as LENEND)
+%define LENEND eax ; Sl, then Sl & 7, read by LOAD15 / LOAD7
+%define LUT ebx ; cursor into jpeg_natural_order_start
+%define T0 ecx ; scratch
+%define T0w cx ; low word of T0
+%define T1 edx ; scratch
+%define BLOCK esi ; block
+%define VALUES edi ; cursor into absvalues
+%define KK ebp ; byte offset into the sign-bit words (+2 per 16 coefficients)
+
+%define ZEROBITS INT [esp + 5 * 4] ; stack slot: the bits pointer
+%define EOB INT [esp + 5 * 4 + 4] ; stack slot: running return value
+%define LEN INT [esp + 5 * 4 + 8] ; stack slot: Sl
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = esp after the push ("original ebp"); arguments are at eax + 8 and up
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; keep it for the "pop esp" in the epilogue
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 16 ; room for the ZEROBITS, EOB and LEN slots
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp ; aligned ebp is saved because ebp is reused as KK
+
+ pcmpeqw ONE, ONE ; all bits set
+ psrlw ONE, 15 ; every word = 1
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24] ; written with movdqa below, so it must be 16-byte aligned
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov K, INT [eax + 16] ; last read through eax; eax now holds Sl
+ mov INT [T0 + 2 * SIZEOF_INT], -1 ; sign bits default to all ones
+ mov INT [T0 + 3 * SIZEOF_INT], -1
+ mov ZEROBITS, T0
+ mov LEN, K
+ pxor ZERO, ZERO
+ and K, -16
+ mov EOB, 0
+ xor KK, KK
+ shr K, 4 ; K = number of full groups of 16
+ jz .ELOOPR16 ; Sl < 16: only a tail to process
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0 ; N0[i] = (X0[i] < 0 ? -1 : 0)
+ pcmpgtw N1, X1
+ paddw X0, N0 ; X0[i] += N0[i]
+ paddw X1, N1
+ pxor X0, N0 ; X0[i] ^= N0[i] -> abs(X0[i])
+ pxor X1, N1
+ psrlw X0, AL ; X0[i] >>= Al
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0 ; absvalues[k + i] = X0[i]
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE ; X0[i] = (X0[i] == 1 ? -1 : 0)
+ pcmpeqw X1, ONE
+ packsswb N0, N1 ; one byte per coefficient: sign masks
+ packsswb X0, X1 ; one byte per coefficient: "equals 1" masks
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0 ; stored bit is 1 for a non-negative coefficient
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ lea T1, [T1+KK*8] ; KK grows by 2 per 16 coefficients, so KK*8 is the group start k
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 2
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15 ; Sl is a multiple of 16: no tail
+ je .PADDINGR
+.ELOOPR16: ; tail: Sl & 15 coefficients remain
+ mov LENEND, LEN
+
+ test LENEND, 8
+ jz .TRYR7 ; bit 3 clear: fewer than 8 remain
+ test LENEND, 7
+ jz .TRYR8 ; exactly 8 remain
+; 9..15 remain
+ and LENEND, 7
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8: ; exactly 8 remain
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO ; upper 8 bytes come from ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7: ; NOTE(review): LOAD7 always reads lane 0, and Sl == 0 also lands here; presumably Sl >= 1 - confirm
+ and LENEND, 7
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO ; upper 8 bytes come from ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR: ; zero-fill absvalues up to DCTSIZE2 words
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3 ; K = number of 8-word groups written so far
+ sub K, DCTSIZE2/8 ; K = -(groups still to clear)
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K ; counts up towards zero
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ sub VALUES, DCTSIZE2*2 ; rewind VALUES by DCTSIZE2 words for REDUCE0
+
+ REDUCE0
+
+ mov eax, EOB ; return value
+
+ pop ebp ; ebp = aligned ebp again
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+; Each output sample is (in[2i] + in[2i+1] + bias) >> 1, where bias alternates 0, 1 along the row.
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; Arguments are read from the stack relative to ebp; the input rows are first padded on the right in place.
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; nothing to do for zero blocks
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store forward
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al and ecx are consumed by rep stosb
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx ; edi = one past the last real sample
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it over the padding
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; saved per row: column count and both row-pointer cursors
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop ; at least one full YMMWORD of output remains
+ alignx 16, 7
+
+.columnloop_r24:
+ ; ecx can possibly be 8, 16, 24
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD] ; upper half of ymm1 is zeroed by the VEX load
+ mov ecx, SIZEOF_YMMWORD ; so that the sub below leaves 0 and ends the row
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT ; ymm2 = odd-indexed samples as words
+ vpand ymm0, ymm0, ymm6 ; ymm0 = even-indexed samples as words
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2 ; even + odd
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7 ; + bias
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1 ; / 2
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1 ; packs within each 128-bit lane
+ vpermq ymm0, ymm0, 0xd8 ; qword order 0,2,1,3 restores sequential order
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 ; NOTE(review): a full YMMWORD is stored even for the 8/16/24 tail; presumably rows are padded - confirm
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24 ; 8, 16 or 24 columns left
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+; Each output sample is (the sum of a 2x2 input block + bias) >> 2, where bias alternates 1, 2 along the row.
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; Arguments are read from the stack relative to ebp; the input rows are first padded on the right in place.
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; nothing to do for zero blocks
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store forward
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al and ecx are consumed by rep stosb
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx ; edi = one past the last real sample
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it over the padding
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; copy the low 128 bits into both halves
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; saved per output row: column count and both row-pointer cursors
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop ; at least one full YMMWORD of output remains
+ alignx 16, 7
+
+.columnloop_r24: ; tail: ecx is 8, 16 or 24 here
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD] ; upper halves of ymm2/ymm3 are zeroed by the VEX loads
+ vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+ mov ecx, SIZEOF_YMMWORD ; so that the sub below leaves 0 and ends the row
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6 ; even-indexed samples of row 0 as words
+ vpsrlw ymm0, ymm0, BYTE_BIT ; odd-indexed samples of row 0 as words
+ vpand ymm5, ymm1, ymm6 ; same split for row 1
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4 ; horizontal pair sums, row 0
+ vpaddw ymm1, ymm1, ymm5 ; horizontal pair sums, row 1
+
+ vpand ymm4, ymm2, ymm6 ; second 32 input bytes of each row, same steps
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1 ; row 0 + row 1
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7 ; + bias
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2 ; / 4
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2 ; packs within each 128-bit lane
+ vpermq ymm0, ymm0, 0xd8 ; qword order 0,2,1,3 restores sequential order
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 ; NOTE(review): a full YMMWORD is stored even for the 8/16/24 tail; presumably rows are padded - confirm
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add edx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24 ; 8, 16 or 24 columns left
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component (MMX version).
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing: out = (in[2i] + in[2i+1] + bias) >> 1, bias = 0,1,0,1...
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; All arguments are read from the stack relative to ebp (offsets below).
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+ push ebp
+ mov ebp, esp ; arguments are addressed relative to ebp from here on
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved here, restored at .return
+ push edi ; saved here, restored at .return
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge: pad each input row out to output_cols*2 samples
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples needed per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store toward higher addresses
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al is reused for the fill value below
+ push ecx ; rep stosb counts ecx down to zero
+
+ mov edi, JSAMPROW [esi] ; edi = start of the current row
+ add edi, edx ; edi = first position past the real samples
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return ; no rows to process
+
+ mov edx, 0x00010000 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6 ; mm6 = all ones
+ punpckldq mm7, mm7 ; mm7={0, 1, 0, 1}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; esi/edi are reused below as sample pointers
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop: ; each pass: 16 input samples -> 8 output samples
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; first 8 input samples
+ movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] ; next 8 input samples
+ movq mm2, mm0
+ movq mm3, mm1
+
+ pand mm0, mm6 ; keep the low byte of each word (even-numbered samples)
+ psrlw mm2, BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand mm1, mm6
+ psrlw mm3, BYTE_BIT
+
+ paddw mm0, mm2 ; sum of each horizontal pair
+ paddw mm1, mm3
+ paddw mm0, mm7 ; add the alternating 0/1 bias
+ paddw mm1, mm7
+ psrlw mm0, 1 ; halve: average of the pair
+ psrlw mm1, 1
+
+ packuswb mm0, mm1 ; narrow the 8 word results back to bytes
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz short .columnloop ; uses the flags of the sub above
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component (MMX version).
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing: out = (sum of a 2x2 input block + bias) >> 2, bias = 1,2,1,2...
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; All arguments are read from the stack relative to ebp (offsets below).
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+ push ebp
+ mov ebp, esp ; arguments are addressed relative to ebp from here on
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved here, restored at .return
+ push edi ; saved here, restored at .return
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge: pad each input row out to output_cols*2 samples
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples needed per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store toward higher addresses
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al is reused for the fill value below
+ push ecx ; rep stosb counts ecx down to zero
+
+ mov edi, JSAMPROW [esi] ; edi = start of the current row
+ add edi, edx ; edi = first position past the real samples
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return ; no rows to process
+
+ mov edx, 0x00020001 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6 ; mm6 = all ones
+ punpckldq mm7, mm7 ; mm7={1, 2, 1, 2}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; esi/edi are reused below as sample pointers
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop: ; each pass: 2 rows x 16 input samples -> 8 output samples
+
+ movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] ; upper row, first 8 samples
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] ; lower row, first 8 samples
+ movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] ; upper row, next 8 samples
+ movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] ; lower row, next 8 samples
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pand mm0, mm6 ; low byte of each word (even-numbered samples)
+ psrlw mm4, BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand mm1, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm0, mm4 ; horizontal pair sums, upper row
+ paddw mm1, mm5 ; horizontal pair sums, lower row
+
+ movq mm4, mm2 ; same split-and-add for the second 8 samples
+ movq mm5, mm3
+ pand mm2, mm6
+ psrlw mm4, BYTE_BIT
+ pand mm3, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ paddw mm0, mm1 ; add the two rows: sum of each 2x2 block
+ paddw mm2, mm3
+ paddw mm0, mm7 ; add the alternating 1/2 bias
+ paddw mm2, mm7
+ psrlw mm0, 2 ; divide by 4: average of the block
+ psrlw mm2, 2
+
+ packuswb mm0, mm2 ; narrow the 8 word results back to bytes
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add edx, byte 2*SIZEOF_MMWORD ; inptr0
+ add esi, byte 2*SIZEOF_MMWORD ; inptr1
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz near .columnloop ; uses the flags of the sub above
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component (SSE2 version).
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing: out = (in[2i] + in[2i+1] + bias) >> 1, bias = 0,1,0,1...
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; All arguments are read from the stack relative to ebp (offsets below).
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push ebp
+ mov ebp, esp ; arguments are addressed relative to ebp from here on
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved here, restored at .return
+ push edi ; saved here, restored at .return
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge: pad each input row out to output_cols*2 samples
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples needed per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store toward higher addresses
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al is reused for the fill value below
+ push ecx ; rep stosb counts ecx down to zero
+
+ mov edi, JSAMPROW [esi] ; edi = start of the current row
+ add edi, edx ; edi = first position past the real samples
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return ; no rows to process
+
+ mov edx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6 ; xmm6 = all ones
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; esi/edi are reused below as sample pointers
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least 16 output columns: full-width pass
+ alignx 16, 7
+
+.columnloop_r8: ; tail: ecx is a multiple of 8 below 16, i.e. 8 columns remain
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; only 16 input samples are read
+ pxor xmm1, xmm1 ; second input half treated as zero
+ mov ecx, SIZEOF_XMMWORD ; so the sub after the store leaves ecx = 0
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop: ; each pass: 32 input samples -> 16 output samples
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; aligned load: inptr must be 16-byte aligned
+ movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6 ; low byte of each word (even-numbered samples)
+ psrlw xmm2, BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2 ; sum of each horizontal pair
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7 ; add the alternating 0/1 bias
+ paddw xmm1, xmm7
+ psrlw xmm0, 1 ; halve: average of the pair
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1 ; narrow the 16 word results back to bytes
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; NOTE(review): always 16 bytes, even on the 8-column tail (upper 8 are 0) - confirm the row has room
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; another full-width pass is possible
+ test ecx, ecx
+ jnz short .columnloop_r8 ; leftover columns: take the tail path once
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component (SSE2 version).
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing: out = (sum of a 2x2 input block + bias) >> 2, bias = 1,2,1,2...
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+; All arguments are read from the stack relative to ebp (offsets below).
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push ebp
+ mov ebp, esp ; arguments are addressed relative to ebp from here on
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved here, restored at .return
+ push edi ; saved here, restored at .return
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge: pad each input row out to output_cols*2 samples
+
+ push ecx ; keep output_cols across the padding step
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx ; ecx = number of padding samples needed per row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax, eax
+ jle short .expand_end
+
+ cld ; rep stosb must store toward higher addresses
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax ; al is reused for the fill value below
+ push ecx ; rep stosb counts ecx down to zero
+
+ mov edi, JSAMPROW [esi] ; edi = start of the current row
+ add edi, edx ; edi = first position past the real samples
+ mov al, JSAMPLE [edi-1] ; al = last real sample of the row
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return ; no rows to process
+
+ mov edx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6 ; xmm6 = all ones
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; esi/edi are reused below as sample pointers
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least 16 output columns: full-width pass
+ alignx 16, 7
+
+.columnloop_r8: ; tail: ecx is a multiple of 8 below 16, i.e. 8 columns remain
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; only 16 samples per row are read
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2 ; second input half treated as zero (upper row)
+ pxor xmm3, xmm3 ; second input half treated as zero (lower row)
+ mov ecx, SIZEOF_XMMWORD ; so the sub after the store leaves ecx = 0
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop: ; each pass: 2 rows x 32 input samples -> 16 output samples
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; aligned loads: inptr0/inptr1 must be 16-byte aligned
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6 ; low byte of each word (even-numbered samples)
+ psrlw xmm4, BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4 ; horizontal pair sums, upper row
+ paddw xmm1, xmm5 ; horizontal pair sums, lower row
+
+ movdqa xmm4, xmm2 ; same split-and-add for the second 16 samples
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1 ; add the two rows: sum of each 2x2 block
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7 ; add the alternating 1/2 bias
+ paddw xmm2, xmm7
+ psrlw xmm0, 2 ; divide by 4: average of the block
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2 ; narrow the 16 word results back to bytes
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; NOTE(review): always 16 bytes, even on the 8-column tail (upper 8 are 0) - confirm the row has room
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add edx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; another full-width pass is possible
+ test ecx, ecx
+ jnz near .columnloop_r8 ; leftover columns: take the tail path once
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace (YCbCr -> RGB, AVX2).
+; One column-loop pass converts 32 pixels; RGB_PIXELSIZE selects 3- or 4-byte output.
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+; ymmA-ymmH / xmmA are register aliases defined outside this section (not visible here).
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0 ; slot where the prologue stores the pre-alignment stack pointer
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2 ; two scratch slots, used below for (B-Y)E and (B-Y)O
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; room for the saved pointer written two lines below
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax ; remember the unaligned frame so the epilogue can restore it
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[] just below the aligned ebp
+ pushpic eax ; make a room for GOT address
+ push ebx ; saved here, restored at .return
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved here, restored at .return
+ push edi ; saved here, restored at .return
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return ; zero-width output: nothing to do
+
+ push ecx ; ecx is borrowed for input_row below
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; row array of component 0 (used as Y below)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; row array of component 1 (used as Cb below)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; row array of component 2 (used as Cr below)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; advance each row array to entry input_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; last use of eax as the argument base
+ test eax, eax
+ jle near .return ; no rows requested
+ alignx 16, 7
+.rowloop:
+ push eax ; row counter; eax holds the GOT address inside the row
+ push edi ; row-array cursors, restored at .nextrow
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0 ; ymm0 = all ones
+ vpcmpeqw ymm7, ymm7, ymm7 ; ymm7 = all ones
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7 ; ymm2=CbE-128 (0xFF80 is -128 as a signed word)
+ vpaddw ymm3, ymm5, ymm7 ; ymm3=CbO-128
+ vpaddw ymm6, ymm0, ymm7 ; ymm6=CrE-128
+ vpaddw ymm7, ymm1, ymm7 ; ymm7=CrO-128 (Cb/Cr below mean these centered values)
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)] ; round before halving (PW_ONE is defined elsewhere; presumably words of 1)
+ vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2 ; first of the two "+ Cb" terms
+ vpaddw ymm5, ymm5, ymm3
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6 ; interleave (CbE,CrE) word pairs as vpmaddwd operands
+ vpunpcklwd ymm2, ymm2, ymm6
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm5, ymm3, ymm7 ; interleave (CbO,CrO) word pairs
+ vpunpcklwd ymm3, ymm3, ymm7
+ vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)] ; presumably the rounding term for the SCALEBITS shift - defined elsewhere
+ vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS ; drop the fixed-point fraction bits
+ vpsrad ymm4, ymm4, SCALEBITS
+ vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm3, ymm3, SCALEBITS
+ vpsrad ymm5, ymm5, SCALEBITS
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4 ; rebuild the low-byte mask (ymm0 was overwritten above)
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64 ; fewer than a full group of pixels left: partial store
+
+ test edi, SIZEOF_YMMWORD-1 ; is outptr YMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA ; non-temporal stores; the sfence after the row loop orders them
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD ; pixels still to convert in this row
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64: ; partial group: store only the bytes that belong to real pixels
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32 ; fewer than two YMMWORDs of bytes left
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF ; the unstored bytes now start in ymmA
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31 ; less than one YMMWORD of bytes left
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD ; the unstored bytes now start in ymmA
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31 ; target is the very next instruction
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap 128-bit halves so the unstored upper half lands in xmmA
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD ; shift the stored bytes out of xmmA
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD ; shift the stored bytes out of xmmA
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA ; eax (GOT address) is free: this path ends the row and .rowloop reloads it
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16 ; bring the next byte down into al
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64 ; fewer than a full group of pixels left: partial store
+
+ test edi, SIZEOF_YMMWORD-1 ; is outptr YMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA ; non-temporal stores; the sfence after the row loop orders them
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD ; pixels still to convert in this row
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64: ; partial group: ecx counts pixels here, 4 output bytes each
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32 ; fewer pixels left than two YMMWORDs hold
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC ; the unstored pixels now start in ymmA, then ymmD
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16 ; fewer pixels left than one YMMWORD holds
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD ; the unstored pixels now start in ymmA
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15 ; fewer pixels left than one XMMWORD holds
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap 128-bit halves so the unstored upper half lands in xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4 ; two-operand form: shifts xmmA in place past the stored pixels
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx ; restore in reverse order of the pushes at .rowloop
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next input row pointer, component 0
+ add ebx, byte SIZEOF_JSAMPROW ; next input row pointer, component 1
+ add edx, byte SIZEOF_JSAMPROW ; next input row pointer, component 2
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width (first stack argument)
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+; Frame layout relative to the aligned ebp established in the prologue:
+%define original_ebp ebp + 0 ; slot written by "mov [esp], eax" in the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2 ; number of MMWORD scratch slots below ebp
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr (one pointer below wk[0])
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+; YCbCr -> RGB for num_rows rows; each .columnloop pass handles one MMWORD of Y/Cb/Cr samples.
+EXTN(jsimd_ycc_rgb_convert_mmx):
+ push ebp ; save caller's ebp
+ mov eax, esp ; eax = original ebp (arguments are addressed relative to it)
+ sub esp, byte 4 ; room for the copy of eax stored below
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; [original_ebp] = eax; reloaded by "pop esp" on exit
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; esp = &wk[0]; the wk[] scratch area sits just below ebp
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return ; zero-width output: nothing to convert
+
+ push ecx ; park num_cols; ecx is borrowed for input_row
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0] (read as Y below)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1] (read as Cb below)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2] (read as Cr below)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return ; signed check: num_rows <= 0 means no work
+ alignx 16, 7
+.rowloop:
+ push eax ; save remaining-row count (eax is reused below)
+ push edi ; save output_buf row cursor
+ push edx ; save input_buf[2] row cursor
+ push ebx ; save input_buf[1] row cursor
+ push esi ; save input_buf[0] row cursor
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
+ movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
+
+ pcmpeqw mm4, mm4 ; mm4 = all ones
+ pcmpeqw mm7, mm7 ; mm7 = all ones
+ psrlw mm4, BYTE_BIT ; keep only the low byte of every word set
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+ movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand mm4, mm5 ; mm4=Cb(0246)=CbE
+ psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO
+ pand mm0, mm1 ; mm0=Cr(0246)=CrE
+ psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO
+
+ paddw mm4, mm7 ; add 0xFF80 (-128 as a signed word) to CbE
+ paddw mm5, mm7 ; same for CbO
+ paddw mm0, mm7 ; same for CrE
+ paddw mm1, mm7 ; same for CrO
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm2, mm4 ; mm2=CbE
+ movq mm3, mm5 ; mm3=CbO
+ paddw mm4, mm4 ; mm4=2*CbE
+ paddw mm5, mm5 ; mm5=2*CbO
+ movq mm6, mm0 ; mm6=CrE
+ movq mm7, mm1 ; mm7=CrO
+ paddw mm0, mm0 ; mm0=2*CrE
+ paddw mm1, mm1 ; mm1=2*CrO
+
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
+ pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
+ pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
+
+ paddw mm4, [GOTOFF(eax,PW_ONE)] ; +1 so the halving shift below rounds
+ paddw mm5, [GOTOFF(eax,PW_ONE)]
+ psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800))
+ psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800))
+ paddw mm0, [GOTOFF(eax,PW_ONE)] ; +1 so the halving shift below rounds
+ paddw mm1, [GOTOFF(eax,PW_ONE)]
+ psraw mm0, 1 ; mm0=(CrE * FIX(0.40200))
+ psraw mm1, 1 ; mm1=(CrO * FIX(0.40200))
+
+ paddw mm4, mm2 ; first of the two "+ Cb" terms
+ paddw mm5, mm3
+ paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
+ movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
+
+ movq mm4, mm2 ; mm4 = CbE copy for the high-half unpack
+ movq mm5, mm3 ; mm5 = CbO copy for the high-half unpack
+ punpcklwd mm2, mm6 ; interleave low CbE/CrE words into (Cb,Cr) pairs
+ punpckhwd mm4, mm6 ; interleave high CbE/CrE words into (Cb,Cr) pairs
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)] ; dword = Cb*-FIX(0.344) + Cr*FIX(0.285)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm3, mm7 ; same pairing for the odd samples
+ punpckhwd mm5, mm7
+ pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)] ; rounding term added before the SCALEBITS shift
+ paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS ; drop the fixed-point fraction
+ psrad mm4, SCALEBITS
+ paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm3, SCALEBITS
+ psrad mm5, SCALEBITS
+
+ packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movq mm5, MMWORD [esi] ; mm5=Y(01234567)
+
+ pcmpeqw mm4, mm4 ; rebuild the low-byte mask (mm4 was reused above)
+ psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
+ pand mm4, mm5 ; mm4=Y(0246)=YE
+ psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO
+
+ paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+ paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA
+ movq mmH, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD ; a full MMWORD worth of columns left?
+ jb short .column_st16 ; no: store only the columns that remain
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; full group: three MMWORDs of packed output
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD ; columns still to do
+ jz short .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD ; from here on ecx counts output bytes
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC ; remaining bytes now come from the third MMWORD
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4 ; skip .column_st8: ecx was below 3*SIZEOF_MMWORD, so less than one MMWORD is left
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE ; remaining bytes now come from the second MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA ; eax = low dword of mmA (this path ends at .nextrow, which pops eax)
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT ; bring the next dword down
+ movd eax, mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT ; bring the next byte into al
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .nextrow ; nothing left to store
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD ; a full MMWORD worth of columns left?
+ jb short .column_st16 ; no: store only the columns that remain
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; full group: four MMWORDs of packed output
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD ; columns still to do
+ jz short .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2 ; ecx still counts columns (pixels) on this path
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC ; shift the second half of the group down
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD ; next pixels to store
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .nextrow ; nothing left to store
+ movd dword [edi+0*SIZEOF_DWORD], mmA ; final single pixel
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx ; restore the values pushed at .rowloop (reverse order)
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance each row cursor by one row pointer
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+ emms ; empty MMX state
+
+.return:
+ pop edi ; restore the registers saved in the prologue
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width (first stack argument)
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+; Frame layout relative to the aligned ebp established in the prologue:
+%define original_ebp ebp + 0 ; slot written by "mov [esp], eax" in the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; number of XMMWORD scratch slots below ebp
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr (one pointer below wk[0])
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+; YCbCr -> RGB for num_rows rows; each .columnloop pass handles one XMMWORD of Y/Cb/Cr samples.
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push ebp ; save caller's ebp
+ mov eax, esp ; eax = original ebp (arguments are addressed relative to it)
+ sub esp, byte 4 ; room for the copy of eax stored below
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; [original_ebp] = eax; reloaded by "pop esp" on exit
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; esp = &wk[0]; the wk[] scratch area sits just below ebp
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return ; zero-width output: nothing to convert
+
+ push ecx ; park num_cols; ecx is borrowed for input_row
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0] (read as Y below)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1] (read as Cb below)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2] (read as Cr below)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return ; signed check: num_rows <= 0 means no work
+ alignx 16, 7
+.rowloop:
+ push eax ; save remaining-row count (eax is reused below)
+ push edi ; save output_buf row cursor
+ push edx ; save input_buf[2] row cursor
+ push ebx ; save input_buf[1] row cursor
+ push esi ; save input_buf[0] row cursor
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4 ; xmm4 = all ones
+ pcmpeqw xmm7, xmm7 ; xmm7 = all ones
+ psrlw xmm4, BYTE_BIT ; keep only the low byte of every word set
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7 ; add 0xFF80 (-128 as a signed word) to CbE
+ paddw xmm5, xmm7 ; same for CbO
+ paddw xmm0, xmm7 ; same for CrE
+ paddw xmm1, xmm7 ; same for CrO
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [GOTOFF(eax,PW_ONE)] ; +1 so the halving shift below rounds
+ paddw xmm5, [GOTOFF(eax,PW_ONE)]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [GOTOFF(eax,PW_ONE)] ; +1 so the halving shift below rounds
+ paddw xmm1, [GOTOFF(eax,PW_ONE)]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2 ; first of the two "+ Cb" terms
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2 ; xmm4 = CbE copy for the high-half unpack
+ movdqa xmm5, xmm3 ; xmm5 = CbO copy for the high-half unpack
+ punpcklwd xmm2, xmm6 ; interleave low CbE/CrE words into (Cb,Cr) pairs
+ punpckhwd xmm4, xmm6 ; interleave high CbE/CrE words into (Cb,Cr) pairs
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] ; dword = Cb*-FIX(0.344) + Cr*FIX(0.285)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3, xmm7 ; same pairing for the odd samples
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] ; rounding term added before the SCALEBITS shift
+ paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS ; drop the fixed-point fraction
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4 ; rebuild the low-byte mask (xmm4 was reused above)
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; a full XMMWORD worth of columns left?
+ jb short .column_st32 ; no: store only the columns that remain
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1 ; no: fall back to unaligned stores
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA ; non-temporal stores; the sfence after the row loop orders them
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD ; columns still to do
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD ; from here on ecx counts output bytes
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF ; remaining bytes now come from the third XMMWORD
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15 ; skip .column_st16: ecx was below 3*SIZEOF_XMMWORD, so less than one XMMWORD is left
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD ; remaining bytes now come from the second XMMWORD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD ; shift the stored bytes out of xmmA
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD ; shift the stored bytes out of xmmA
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA ; eax = low dword of xmmA (this path ends at .nextrow, which pops eax)
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16 ; bring the next byte into al
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow ; nothing left to store
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; a full XMMWORD worth of columns left?
+ jb short .column_st32 ; no: store only the columns that remain
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1 ; no: fall back to unaligned stores
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA ; non-temporal stores; the sfence after the row loop orders them
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD ; columns still to do
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2 ; ecx still counts columns (pixels) on this path
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC ; shift the second half of the group down
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD ; next pixels to store
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4 ; shift the stored pixels out of xmmA
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow ; nothing left to store
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx ; restore the values pushed at .rowloop (reverse order)
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; advance each row cursor by one row pointer
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+ sfence ; flush the write buffer
+
+.return:
+ pop edi ; restore the registers saved in the prologue
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fractional bits of the fixed-point constants (FIX(1) = 65536 below)
+; FIX(x) below is x scaled by 2^SCALEBITS and rounded to an integer.
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+; Constant table for the AVX2 YCC->RGB routines; every row below is 32 bytes.
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402 ; 16 words of FIX(0.40200)
+PW_MF0228 times 16 dw -F_0_228 ; 16 words of -FIX(0.22800)
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 ; 8 (-FIX(0.344), FIX(0.285)) word pairs
+PW_ONE times 16 dw 1 ; 16 words of 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) ; 8 dwords of half a fixed-point unit
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jdcolext-avx2.asm is included once per pixel layout; the first copy uses the RGB_* values already in effect here.
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2 ; next copy is emitted under this name
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor-mmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16  ; fraction bits of the fixed-point format: FIX(x) = x * 2^16 rounded, so FIX(1) = 65536
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414); within this file only used to derive F_0_285
+F_1_402 equ 91881 ; FIX(1.40200); too large for a 16-bit word, within this file only used to derive F_0_402
+F_1_772 equ 116130 ; FIX(1.77200); too large for a 16-bit word, within this file only used to derive F_0_228
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits the dw table entries
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST  ; constant table for this file (SEG_CONST is defined outside this file)
+
+ alignz 32  ; alignz is not defined in this file; presumably aligns to 32 bytes with zero fill -- confirm in jsimdext.inc
+ GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)  ; presumably exports the label below (macro defined outside this file)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402 times 4 dw F_0_402  ; 4 x 16-bit FIX(0.40200) = one 64-bit row
+PW_MF0228 times 4 dw -F_0_228  ; 4 x 16-bit -FIX(0.22800)
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285  ; 2 interleaved word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 4 dw 1  ; 4 x 16-bit 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)  ; 2 x 32-bit 2^(SCALEBITS-1), i.e. 0.5 in the SCALEBITS fixed-point format
+
+ alignz 32  ; presumably pads the end of the table to a 32-byte boundary (see note on alignz above)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT  ; everything the template inclusions emit below lands in this section
+ BITS 32  ; 32-bit code (this is the simd/i386 copy of the file)
+
+%include "jdcolext-mmx.asm"  ; first pass: no overrides, so the template sees whatever RGB_* values are already in effect (presumably defaults from an included file)
+
+%undef RGB_RED  ; ---- EXT_RGB variant ---- discard the layout macros used by the previous inclusion
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED  ; RGB_* now alias the EXT_RGB_* symbols (not defined in this file)
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx  ; generic name now expands to the EXT_RGB-specific name
+%include "jdcolext-mmx.asm"  ; re-include the template under the macros above (presumably emits jsimd_ycc_extrgb_convert_mmx -- confirm in the template)
+
+%undef RGB_RED  ; ---- EXT_RGBX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"  ; EXT_RGBX pass of the template (presumably emits jsimd_ycc_extrgbx_convert_mmx)
+
+%undef RGB_RED  ; ---- EXT_BGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"  ; EXT_BGR pass of the template (presumably emits jsimd_ycc_extbgr_convert_mmx)
+
+%undef RGB_RED  ; ---- EXT_BGRX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"  ; EXT_BGRX pass of the template (presumably emits jsimd_ycc_extbgrx_convert_mmx)
+
+%undef RGB_RED  ; ---- EXT_XBGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"  ; EXT_XBGR pass of the template (presumably emits jsimd_ycc_extxbgr_convert_mmx)
+
+%undef RGB_RED  ; ---- EXT_XRGB variant (last inclusion in this file) ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"  ; EXT_XRGB pass of the template (presumably emits jsimd_ycc_extxrgb_convert_mmx)
diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor-sse2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16  ; fraction bits of the fixed-point format: FIX(x) = x * 2^16 rounded, so FIX(1) = 65536
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414); within this file only used to derive F_0_285
+F_1_402 equ 91881 ; FIX(1.40200); too large for a 16-bit word, within this file only used to derive F_0_402
+F_1_772 equ 116130 ; FIX(1.77200); too large for a 16-bit word, within this file only used to derive F_0_228
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits the dw table entries
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST  ; constant table for this file (SEG_CONST is defined outside this file)
+
+ alignz 32  ; alignz is not defined in this file; presumably aligns to 32 bytes with zero fill -- confirm in jsimdext.inc
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)  ; presumably exports the label below (macro defined outside this file)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402  ; 8 x 16-bit FIX(0.40200) = one 128-bit row
+PW_MF0228 times 8 dw -F_0_228  ; 8 x 16-bit -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285  ; 4 interleaved word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 8 dw 1  ; 8 x 16-bit 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)  ; 4 x 32-bit 2^(SCALEBITS-1), i.e. 0.5 in the SCALEBITS fixed-point format
+
+ alignz 32  ; presumably pads the end of the table to a 32-byte boundary (see note on alignz above)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT  ; everything the template inclusions emit below lands in this section
+ BITS 32  ; 32-bit code (this is the simd/i386 copy of the file)
+
+%include "jdcolext-sse2.asm"  ; first pass: no overrides, so the template sees whatever RGB_* values are already in effect (presumably defaults from an included file)
+
+%undef RGB_RED  ; ---- EXT_RGB variant ---- discard the layout macros used by the previous inclusion
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED  ; RGB_* now alias the EXT_RGB_* symbols (not defined in this file)
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2  ; generic name now expands to the EXT_RGB-specific name
+%include "jdcolext-sse2.asm"  ; re-include the template under the macros above (presumably emits jsimd_ycc_extrgb_convert_sse2 -- confirm in the template)
+
+%undef RGB_RED  ; ---- EXT_RGBX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"  ; EXT_RGBX pass of the template (presumably emits jsimd_ycc_extrgbx_convert_sse2)
+
+%undef RGB_RED  ; ---- EXT_BGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"  ; EXT_BGR pass of the template (presumably emits jsimd_ycc_extbgr_convert_sse2)
+
+%undef RGB_RED  ; ---- EXT_BGRX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"  ; EXT_BGRX pass of the template (presumably emits jsimd_ycc_extbgrx_convert_sse2)
+
+%undef RGB_RED  ; ---- EXT_XBGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"  ; EXT_XBGR pass of the template (presumably emits jsimd_ycc_extxbgr_convert_sse2)
+
+%undef RGB_RED  ; ---- EXT_XRGB variant (last inclusion in this file) ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"  ; EXT_XRGB pass of the template (presumably emits jsimd_ycc_extxrgb_convert_sse2)
diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge-avx2.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16  ; fraction bits of the fixed-point format: FIX(x) = x * 2^16 rounded, so FIX(1) = 65536
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414); within this file only used to derive F_0_285
+F_1_402 equ 91881 ; FIX(1.40200); too large for a 16-bit word, within this file only used to derive F_0_402
+F_1_772 equ 116130 ; FIX(1.77200); too large for a 16-bit word, within this file only used to derive F_0_228
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits the dw table entries
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST  ; constant table for this file (SEG_CONST is defined outside this file)
+
+ alignz 32  ; alignz is not defined in this file; presumably aligns to 32 bytes with zero fill -- confirm in jsimdext.inc
+ GLOBAL_DATA(jconst_merged_upsample_avx2)  ; presumably exports the label below (macro defined outside this file)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402  ; 16 x 16-bit FIX(0.40200) = one 256-bit row; vpmulhw operand for the Cr term
+PW_MF0228 times 16 dw -F_0_228  ; 16 x 16-bit -FIX(0.22800); vpmulhw operand for the Cb term
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285  ; 8 interleaved word pairs (-FIX(0.34414), FIX(0.28586)); vpmaddwd operand
+PW_ONE times 16 dw 1  ; 16 x 16-bit 1; added before the arithmetic >> 1 in jdmrgext-avx2.asm
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)  ; 8 x 32-bit 2^(SCALEBITS-1) = 0.5 in fixed point; added before the >> SCALEBITS
+
+ alignz 32  ; presumably pads the end of the table to a 32-byte boundary (see note on alignz above)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT  ; everything the template inclusions emit below lands in this section
+ BITS 32  ; 32-bit code (this is the simd/i386 copy of the file)
+
+%include "jdmrgext-avx2.asm"  ; first pass: no overrides, so the template sees whatever RGB_* values are already in effect (presumably defaults from an included file)
+
+%undef RGB_RED  ; ---- EXT_RGB variant ---- discard the layout macros used by the previous inclusion
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED  ; RGB_* now alias the EXT_RGB_* symbols (not defined in this file)
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE  ; the template branches on this (%if RGB_PIXELSIZE == 3) and scales its output-pointer step by it
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; the template declares its entry point through the generic name, so this pass emits the *_extrgb_* symbols
+
+%undef RGB_RED  ; ---- EXT_RGBX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; EXT_RGBX pass -> *_extrgbx_* symbols
+
+%undef RGB_RED  ; ---- EXT_BGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; EXT_BGR pass -> *_extbgr_* symbols
+
+%undef RGB_RED  ; ---- EXT_BGRX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; EXT_BGRX pass -> *_extbgrx_* symbols
+
+%undef RGB_RED  ; ---- EXT_XBGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; EXT_XBGR pass -> *_extxbgr_* symbols
+
+%undef RGB_RED  ; ---- EXT_XRGB variant (last inclusion in this file) ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"  ; EXT_XRGB pass -> *_extxrgb_* symbols
diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge-mmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16  ; fraction bits of the fixed-point format: FIX(x) = x * 2^16 rounded, so FIX(1) = 65536
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414); within this file only used to derive F_0_285
+F_1_402 equ 91881 ; FIX(1.40200); too large for a 16-bit word, within this file only used to derive F_0_402
+F_1_772 equ 116130 ; FIX(1.77200); too large for a 16-bit word, within this file only used to derive F_0_228
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits the dw table entries
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST  ; constant table for this file (SEG_CONST is defined outside this file)
+
+ alignz 32  ; alignz is not defined in this file; presumably aligns to 32 bytes with zero fill -- confirm in jsimdext.inc
+ GLOBAL_DATA(jconst_merged_upsample_mmx)  ; presumably exports the label below (macro defined outside this file)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402 times 4 dw F_0_402  ; 4 x 16-bit FIX(0.40200) = one 64-bit row
+PW_MF0228 times 4 dw -F_0_228  ; 4 x 16-bit -FIX(0.22800)
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285  ; 2 interleaved word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 4 dw 1  ; 4 x 16-bit 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)  ; 2 x 32-bit 2^(SCALEBITS-1), i.e. 0.5 in the SCALEBITS fixed-point format
+
+ alignz 32  ; presumably pads the end of the table to a 32-byte boundary (see note on alignz above)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT  ; everything the template inclusions emit below lands in this section
+ BITS 32  ; 32-bit code (this is the simd/i386 copy of the file)
+
+%include "jdmrgext-mmx.asm"  ; first pass: no overrides, so the template sees whatever RGB_* values are already in effect (presumably defaults from an included file)
+
+%undef RGB_RED  ; ---- EXT_RGB variant ---- discard the layout macros used by the previous inclusion
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED  ; RGB_* now alias the EXT_RGB_* symbols (not defined in this file)
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; re-include the template; both generic names now expand to *_extrgb_* (presumably the emitted symbols -- confirm in the template)
+
+%undef RGB_RED  ; ---- EXT_RGBX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; EXT_RGBX pass -> *_extrgbx_* names
+
+%undef RGB_RED  ; ---- EXT_BGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; EXT_BGR pass -> *_extbgr_* names
+
+%undef RGB_RED  ; ---- EXT_BGRX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; EXT_BGRX pass -> *_extbgrx_* names
+
+%undef RGB_RED  ; ---- EXT_XBGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; EXT_XBGR pass -> *_extxbgr_* names
+
+%undef RGB_RED  ; ---- EXT_XRGB variant (last inclusion in this file) ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"  ; EXT_XRGB pass -> *_extxrgb_* names
diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge-sse2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16  ; fraction bits of the fixed-point format: FIX(x) = x * 2^16 rounded, so FIX(1) = 65536
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414); within this file only used to derive F_0_285
+F_1_402 equ 91881 ; FIX(1.40200); too large for a 16-bit word, within this file only used to derive F_0_402
+F_1_772 equ 116130 ; FIX(1.77200); too large for a 16-bit word, within this file only used to derive F_0_228
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits the dw table entries
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST  ; constant table for this file (SEG_CONST is defined outside this file)
+
+ alignz 32  ; alignz is not defined in this file; presumably aligns to 32 bytes with zero fill -- confirm in jsimdext.inc
+ GLOBAL_DATA(jconst_merged_upsample_sse2)  ; presumably exports the label below (macro defined outside this file)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402  ; 8 x 16-bit FIX(0.40200) = one 128-bit row
+PW_MF0228 times 8 dw -F_0_228  ; 8 x 16-bit -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285  ; 4 interleaved word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 8 dw 1  ; 8 x 16-bit 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)  ; 4 x 32-bit 2^(SCALEBITS-1), i.e. 0.5 in the SCALEBITS fixed-point format
+
+ alignz 32  ; presumably pads the end of the table to a 32-byte boundary (see note on alignz above)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT  ; everything the template inclusions emit below lands in this section
+ BITS 32  ; 32-bit code (this is the simd/i386 copy of the file)
+
+%include "jdmrgext-sse2.asm"  ; first pass: no overrides, so the template sees whatever RGB_* values are already in effect (presumably defaults from an included file)
+
+%undef RGB_RED  ; ---- EXT_RGB variant ---- discard the layout macros used by the previous inclusion
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED  ; RGB_* now alias the EXT_RGB_* symbols (not defined in this file)
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; re-include the template; both generic names now expand to *_extrgb_* (presumably the emitted symbols -- confirm in the template)
+
+%undef RGB_RED  ; ---- EXT_RGBX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; EXT_RGBX pass -> *_extrgbx_* names
+
+%undef RGB_RED  ; ---- EXT_BGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; EXT_BGR pass -> *_extbgr_* names
+
+%undef RGB_RED  ; ---- EXT_BGRX variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; EXT_BGRX pass -> *_extbgrx_* names
+
+%undef RGB_RED  ; ---- EXT_XBGR variant ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; EXT_XBGR pass -> *_extxbgr_* names
+
+%undef RGB_RED  ; ---- EXT_XRGB variant (last inclusion in this file) ----
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"  ; EXT_XRGB pass -> *_extxrgb_* names
diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext-avx2.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width (1st stack arg; b points at the saved ebp, return address sits at b+4)
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0 ; slot at the aligned ebp; the prologue stores the pre-alignment stack pointer here
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; address of work slot i, counted down from the aligned ebp
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 3 ; number of YMMWORD work slots (wk(0)..wk(2))
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr (pointer-sized slot just below wk(0))
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Calls jsimd_h2v1_merged_upsample_avx2 twice, once per output row; eax, ecx and edx are not preserved.
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push ebp ; save caller's frame pointer
+ mov ebp, esp ; ebp = frame base; the argument macros above start at ebp+8
+ push ebx ; ebx, esi and edi are restored before ret
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, POINTER [output_width(ebp)] ; eax = output_width (NOTE(review): a JDIMENSION read via the POINTER macro)
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)] ; edi = input_buf
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)] ; ecx = in_row_group_ctr
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; edi = output_buf
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = input_buf[0] + in_row_group_ctr*SIZEOF_JSAMPROW
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp ; ebx = on-stack array {inptr00, inptr1, inptr2}
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2) ; first output row
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi ; array slot 0 <- inptr01
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi ; pushed output_buf argument <- outptr1
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2) ; second output row; array slots 1 and 2 are unchanged
+
+ add esp, byte 7*SIZEOF_DWORD ; drop the 3 array slots and 4 arguments pushed above
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Converts one row: each 8-sample Cb/Cr load feeds two 8-pixel Y passes, so one chroma sample covers two pixels.
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0 ; [ebp] holds the esp value saved in the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 3 ; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+ push ebp ; save caller's ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; reserve the slot that receives eax below
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; saved so the epilogue's "pop esp" can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return ; output_width == 0: nothing to convert
+
+ push ecx ; park col; ecx is reused for in_row_group_ctr
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; input_buf[0] (rows read as Y below)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; input_buf[1] (rows read as Cb below)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; input_buf[2] (rows read as Cr below)
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop: ; per iteration: 8 Cb + 8 Cr samples, up to two Y passes
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
+ movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
+
+ pxor mm1, mm1 ; mm1=(all 0's)
+ pcmpeqw mm3, mm3 ; mm3=(all 1's)
+ psllw mm3, 7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ movq mm4, mm6
+ punpckhbw mm6, mm1 ; mm6=Cb(4567)=CbH
+ punpcklbw mm4, mm1 ; mm4=Cb(0123)=CbL
+ movq mm0, mm7
+ punpckhbw mm7, mm1 ; mm7=Cr(4567)=CrH
+ punpcklbw mm0, mm1 ; mm0=Cr(0123)=CrL
+
+ paddw mm6, mm3 ; CbH -= 128 (0xFF80 is -128 as a signed word)
+ paddw mm4, mm3 ; CbL -= 128
+ paddw mm7, mm3 ; CrH -= 128
+ paddw mm0, mm3 ; CrL -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm5, mm6 ; mm5=CbH
+ movq mm2, mm4 ; mm2=CbL
+ paddw mm6, mm6 ; mm6=2*CbH
+ paddw mm4, mm4 ; mm4=2*CbL
+ movq mm1, mm7 ; mm1=CrH
+ movq mm3, mm0 ; mm3=CrL
+ paddw mm7, mm7 ; mm7=2*CrH
+ paddw mm0, mm0 ; mm0=2*CrL
+
+ pmulhw mm6, [GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
+ pmulhw mm7, [GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
+
+ paddw mm6, [GOTOFF(eax,PW_ONE)] ; presumably +1 so the psraw below rounds -- confirm PW_ONE
+ paddw mm4, [GOTOFF(eax,PW_ONE)]
+ psraw mm6, 1 ; mm6=(CbH * -FIX(0.22800))
+ psraw mm4, 1 ; mm4=(CbL * -FIX(0.22800))
+ paddw mm7, [GOTOFF(eax,PW_ONE)]
+ paddw mm0, [GOTOFF(eax,PW_ONE)]
+ psraw mm7, 1 ; mm7=(CrH * FIX(0.40200))
+ psraw mm0, 1 ; mm0=(CrL * FIX(0.40200))
+
+ paddw mm6, mm5 ; + CbH (first of two additions)
+ paddw mm4, mm2 ; + CbL (first of two additions)
+ paddw mm6, mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw mm4, mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw mm7, mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw mm0, mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
+ movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
+
+ movq mm6, mm5 ; mm6=CbH
+ movq mm7, mm2 ; mm7=CbL
+ punpcklwd mm5, mm1 ; mm5=(Cb,Cr) word pairs from the low half of CbH/CrH
+ punpckhwd mm6, mm1 ; mm6=(Cb,Cr) word pairs from the high half of CbH/CrH
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)] ; per pair: Cb*-FIX(0.344)+Cr*FIX(0.285), as a dword
+ pmaddwd mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm2, mm3 ; same pairing for CbL/CrL
+ punpckhwd mm7, mm3
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)] ; presumably the rounding term for the SCALEBITS shift -- confirm PD_ONEHALF
+ paddd mm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm5, SCALEBITS
+ psrad mm6, SCALEBITS
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS
+ psrad mm7, SCALEBITS
+
+ packssdw mm5, mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw mm2, mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw mm5, mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw mm2, mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st ; first pass uses the L terms still in mm0/mm2/mm4
+ alignx 16, 7
+
+.Yloop_2nd: ; second pass: reload the H terms saved in wk[]
+ movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
+ movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
+ movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st: ; in: mm0/mm2/mm4 = (R-Y)/(G-Y)/(B-Y) for this half
+ movq mm7, MMWORD [esi] ; mm7=Y(01234567)
+
+ pcmpeqw mm6, mm6 ; mm6=(all 1's)
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+ pand mm6, mm7 ; mm6=Y(0246)=YE
+ psrlw mm7, BYTE_BIT ; mm7=Y(1357)=YO
+
+ movq mm1, mm0 ; mm1=mm0=(R-Y)(L/H)
+ movq mm3, mm2 ; mm3=mm2=(G-Y)(L/H)
+ movq mm5, mm4 ; mm5=mm4=(B-Y)(L/H)
+
+ paddw mm0, mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+ paddw mm5, mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA ; mmG=mmA, for the high-word unpack below
+ movq mmH, mmA ; mmH=mmA, shifted right by one word below
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16 ; fewer than SIZEOF_MMWORD pixels left: partial store
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz near .endcolumn ; row complete
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16: ; tail: ecx = pixels left, fewer than SIZEOF_MMWORD
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC ; the remaining bytes come from the third mmword
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE ; the remaining bytes come from the second mmword
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA ; eax = low dword of mmA (next output bytes)
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT
+ movd eax, mmA ; eax = the dword that followed the one just stored
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .endcolumn
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF ; filler bytes are all 1's
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else ; filler bytes are all 0's
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16 ; fewer than SIZEOF_MMWORD pixels left: partial store
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .endcolumn ; row complete
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16: ; tail: ecx = pixels left (not scaled to bytes here)
+ cmp ecx, byte SIZEOF_MMWORD/2
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC ; shift the unwritten pixels down into mmA/mmD
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .endcolumn
+ movd dword [edi+0*SIZEOF_DWORD], mmA ; store the low dword of mmA (the last pixel)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Calls jsimd_h2v1_merged_upsample_mmx twice, once per output row; eax, ecx and edx are not preserved.
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+ push ebp ; save caller's frame pointer
+ mov ebp, esp ; ebp = frame base; the argument macros above start at ebp+8
+ push ebx ; ebx, esi and edi are restored before ret
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)] ; eax = output_width
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)] ; edi = input_buf
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)] ; ecx = in_row_group_ctr
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; edi = output_buf
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = input_buf[0] + in_row_group_ctr*SIZEOF_JSAMPROW
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp ; ebx = on-stack array {inptr00, inptr1, inptr2}
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx) ; first output row
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi ; array slot 0 <- inptr01
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi ; pushed output_buf argument <- outptr1
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx) ; second output row; array slots 1 and 2 are unchanged
+
+ add esp, byte 7*SIZEOF_DWORD ; drop the 3 array slots and 4 arguments pushed above
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [GOTOFF(eax,PW_ONE)]
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [GOTOFF(eax,PW_ONE)]
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Done as two calls to jsimd_h2v1_merged_upsample_sse2, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)] ; eax = output_width (a JDIMENSION, not a pointer)
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][in_row_group_ctr]; NOTE(review): callee presumably indexes by the counter again -- confirm
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp ; ebx -> the 3 pointers just pushed; passed below as input_buf
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2) ; first output row
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi ; overwrite array slot 0 (was inptr00)
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi ; overwrite the pushed output_buf argument
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2) ; second output row, same stacked arguments
+
+ add esp, byte 7*SIZEOF_DWORD ; drop 4 arguments + the 3-entry pointer array
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1 ; h2v1 fancy: bias added to the previous-sample term (even outputs, >>2)
+PW_TWO times 16 dw 2 ; h2v1 fancy: bias added to the next-sample term (odd outputs, >>2)
+PW_THREE times 16 dw 3 ; weight of the nearer sample/row in both fancy routines
+PW_SEVEN times 16 dw 7 ; h2v2 fancy: bias added to the next-sample term (odd outputs, >>4)
+PW_EIGHT times 16 dw 8 ; h2v2 fancy: bias added to the previous-sample term (even outputs, >>4)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+; out[2i] = (3*s[i] + s[i-1] + 1) >> 2; out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_YMMWORD-1 ; is the width a multiple of 32?
+ jz short .skip ; yes: the last block is full, no padding sample needed
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+ vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm7=( 0 -- -- ... -- --): sample 0 stands in for sample -1
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD ; colctr rounded up to a multiple of 32
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop ; more than one block left: take the right neighbor from the next block
+ alignx 16, 7
+
+.columnloop_last:
+ vpcmpeqb xmm6, xmm6, xmm6
+ vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
+ vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm6=(-- -- ... -- 31): byte 31 of this block stands in for sample 32
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm6=next block (32 33 ... 63)
+ vperm2i128 ymm6, ymm0, ymm6, 0x20 ; ymm6=(zeros | 32 33 ... 47)
+ vpslldq ymm6, ymm6, 15 ; ymm6=(-- -- ... -- 32)
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20 ; ymm2=(zeros | 0 1 ... 15)
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03 ; ymm4=(16 17 ... 31 | zeros)
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+
+ vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)] ; 3*s[i], low half
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] ; 3*s[i], high half
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)] ; s[i-1] + 1
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+ vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)] ; s[i+1] + 2
+ vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT ; move the odd outputs into the high byte of each word
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+ sub eax, byte SIZEOF_YMMWORD
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop ; at least two blocks remain
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block remains
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1] (16-bit).
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; Horizontal: out[2i] = (3*Int[i]+Int[i-1]+8)>>4; out[2i+1] = (3*Int[i]+Int[i+1]+7)>>4
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax ; keep the unaligned frame address at the aligned frame base
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[0..WK_NUM-1] below the aligned frame base
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_YMMWORD-1 ; is the width a multiple of 32?
+ jz short .skip ; yes: no padding samples needed
+ push edx ; dl is used as scratch below; keep outptr0
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ pushpic ebx ; ebx (inptr0) is reused for the GOT address until poppic
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)] ; 3*row[0], low half
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] ; 3*row[0], high half
+
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2 ; (the output rows double as 16-bit scratch)
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1 ; wk(0) = upper-row "-1" neighbor (Int0[0] for the first block)
+ vmovdqa YMMWORD [wk(1)], ymm2 ; wk(1) = lower-row "-1" neighbor (Int1[0] for the first block)
+
+ poppic ebx
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD ; colctr rounded up to a multiple of 32
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop ; more than one block left
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpcmpeqb xmm1, xmm1, xmm1
+ vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+ vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2 ; (next block's slice of the output rows)
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm3, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm3, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; wk(2) = upper-row "+32" neighbor (next block's Int0[0])
+ vmovdqa YMMWORD [wk(3)], ymm2 ; wk(3) = lower-row "+32" neighbor (next block's Int1[0])
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm0, ymm1, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm1, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm1, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm1, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; this block's element 31 becomes the next block's "-1"
+
+ vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)] ; 3*Int0[i]
+ vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] ; Int0[i-1] + 8
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)] ; Int0[i+1] + 7
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT ; move the odd outputs into the high byte of each word
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; final pixels replace the intermediates
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm7, ymm1, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm1, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm1, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm1, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3 ; this block's element 31 becomes the next block's "-1"
+
+ vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)] ; 3*Int1[i]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] ; Int1[i-1] + 8
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)] ; Int1[i+1] + 7
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT ; move the odd outputs into the high byte of each word
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1 ; final pixels replace the intermediates
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_YMMWORD
+ add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop ; at least two blocks remain
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block remains
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+; Each input sample is written twice in a row; width is rounded up to 32 columns.
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD ; edx = output_width rounded up to a multiple of 32
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .above_16 ; more than 32 output columns (16 input samples) left
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; last 32 output columns: read only 16 input samples
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = samples 8..15, each doubled
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = samples 0..7, each doubled
+
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8 ; swap the middle qwords so the in-lane unpacks yield samples in order
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = samples 16..31, each doubled
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = samples 0..15, each doubled
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD ; 64 output columns done
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+; Each input sample is doubled horizontally and stored to two output rows.
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD ; edx = output_width rounded up to a multiple of 32
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .above_16 ; more than 32 output columns (16 input samples) left
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; last 32 output columns: read only 16 input samples
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = samples 8..15, each doubled
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = samples 0..7, each doubled
+
+ vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; second output row gets the same pixels
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp near .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8 ; swap the middle qwords so the in-lane unpacks yield samples in order
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = samples 16..31, each doubled
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = samples 0..15, each doubled
+
+ vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 ; second output row gets the same pixels
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD ; 64 output columns done
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add ebx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample-mmx.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx): ; word constants for the fancy upsamplers, four 16-bit lanes each
+
+PW_ONE times 4 dw 1 ; rounding bias added to the left-neighbour term in the h2v1 routine
+PW_TWO times 4 dw 2 ; rounding bias added to the right-neighbour term in the h2v1 routine
+PW_THREE times 4 dw 3 ; weight applied to the current (nearer) sample
+PW_SEVEN times 4 dw 7 ; rounding bias added to the right-neighbour term in the h2v2 routine
+PW_EIGHT times 4 dw 8 ; rounding bias added to the left-neighbour term in the h2v2 routine
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; Computes out[2i] = (3*s[i] + s[i-1] + 1) >> 2 and out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2.
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+ push ebp
+ mov ebp, esp ; ebp = frame pointer used by the argument %defines above
+ pushpic ebx ; NOTE(review): macro from jsimdext.inc (not shown); presumably saves ebx for use as GOT base
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return ; zero columns: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi ; output_data cursor (edi is reused as outptr below)
+ push esi ; input_data cursor (esi is reused as inptr below)
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_MMWORD-1 ; is the width a whole number of MMWORD blocks?
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (one past the row end; assumes the buffer has slack - TODO confirm)
+.skip:
+ pxor mm0, mm0 ; mm0=(all 0's)
+ pcmpeqb mm7, mm7 ; mm7=(all 1's)
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mask that keeps only the lowest byte
+ pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=( 0 - - - - - - -): sample 0 acts as its own left neighbour
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD ; colctr rounded up to a multiple of SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop ; more than one block: take the general path first
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb mm6, mm6 ; mm6=(all 1's)
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; mask that keeps only the highest byte
+ pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7): sample 7 acts as its own right neighbour
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] ; peek at the following block
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8): its first sample, moved to the top byte
+
+.upsample:
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2, mm1
+ movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
+ psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
+ psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
+
+ por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
+ por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
+
+ movq mm7, mm1
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -), the -1 sample of the next block
+
+ movq mm4, mm1
+ punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
+ punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
+ movq mm5, mm2
+ punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
+ punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
+ movq mm6, mm3
+ punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
+ punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
+
+ pmullw mm1, [GOTOFF(ebx,PW_THREE)] ; 3 * current sample, low half
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; 3 * current sample, high half
+ paddw mm2, [GOTOFF(ebx,PW_ONE)] ; left neighbour + rounding bias 1
+ paddw mm5, [GOTOFF(ebx,PW_ONE)]
+ paddw mm3, [GOTOFF(ebx,PW_TWO)] ; right neighbour + rounding bias 2
+ paddw mm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw mm2, mm1
+ paddw mm5, mm4
+ psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
+ psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
+ paddw mm3, mm1
+ paddw mm6, mm4
+ psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
+ psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
+
+ psllw mm3, BYTE_BIT ; move the odd outputs into the high byte of each word
+ psllw mm6, BYTE_BIT
+ por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
+ por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+ sub eax, byte SIZEOF_MMWORD ; one input block consumed
+ add esi, byte 1*SIZEOF_MMWORD ; inptr
+ add edi, byte 2*SIZEOF_MMWORD ; outptr
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop ; more than one block left
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi ; restore the input_data cursor
+ pop edi ; restore the output_data cursor
+ pop eax ; restore the unrounded colctr
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return: ; the early exits above arrive here before any MMX instruction has run
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx ; counterpart of the pushpic in the prologue
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+; Vertical pass: Int0 = 3*row[0] + row[-1] (upper output row), Int1 = 3*row[0] + row[+1] (lower).
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; Horizontal pass: out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4, out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4.
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; stored at [aligned ebp] so the epilogue can reload it with 'pop esp'
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the wk[] scratch area below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return ; zero columns: nothing to do
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx ; rowctr (ecx is reused as a row pointer below)
+ push edi ; output_data cursor
+ push esi ; input_data cursor
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_MMWORD-1 ; is the width a whole number of MMWORD blocks?
+ jz short .skip
+ push edx ; dl is used as byte scratch; keep outptr0 safe
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; duplicate the last sample of the row above
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl ; duplicate the last sample of the current row
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (one past the row end; assumes the buffers have slack - TODO confirm)
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
+ movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
+ movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
+
+ pushpic ebx ; NOTE(review): macro not shown; presumably preserves inptr0 while ebx serves as GOT base
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; 3 * current row, low half
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; 3 * current row, high half
+
+ pcmpeqb mm7, mm7 ; mm7=(all 1's)
+ psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the lowest 16-bit word
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; (the output rows double as scratch space
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 ; for the 16-bit intermediates)
+
+ pand mm1, mm7 ; mm1=( 0 - - -)
+ pand mm2, mm7 ; mm2=( 0 - - -)
+
+ movq MMWORD [wk(0)], mm1 ; wk(0) = left neighbour (-1) for the upper row
+ movq MMWORD [wk(1)], mm2 ; wk(1) = left neighbour (-1) for the lower row
+
+ poppic ebx ; counterpart of the pushpic above
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD ; colctr rounded up to a multiple of SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop ; more than one block: take the general path first
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx ; paired with the poppic after the lower row is stored
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb mm1, mm1 ; mm1=(all 1's)
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the highest 16-bit word
+ movq mm2, mm1
+
+ pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
+ pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
+
+ movq MMWORD [wk(2)], mm1 ; wk(2) = right neighbour for the upper row (column 7 itself)
+ movq MMWORD [wk(3)], mm2 ; wk(3) = right neighbour for the lower row (column 7 itself)
+
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
+ movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
+
+ pushpic ebx ; paired with the poppic after the lower row is stored
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; 3 * current row, low half
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; 3 * current row, high half
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 ; (next block's intermediates, consumed
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 ; on the following pass through .upsample)
+
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
+ psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
+
+ movq MMWORD [wk(2)], mm1 ; wk(2) = right neighbour for the upper row (next block's column 0)
+ movq MMWORD [wk(3)], mm2 ; wk(3) = right neighbour for the lower row (next block's column 0)
+
+.upsample:
+ ; -- process the upper row
+
+ movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
+ movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
+
+ movq mm0, mm7
+ movq mm4, mm3
+ psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
+ psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
+ movq mm5, mm7
+ movq mm6, mm3
+ psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
+ psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
+
+ por mm0, mm4 ; mm0=( 1 2 3 4)
+ por mm5, mm6 ; mm5=( 3 4 5 6)
+
+ movq mm1, mm7
+ movq mm2, mm3
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
+ movq mm4, mm3
+ psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
+
+ por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
+ por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
+
+ movq MMWORD [wk(0)], mm4 ; column 7 becomes the -1 column of the next block
+
+ pmullw mm7, [GOTOFF(ebx,PW_THREE)] ; 3 * Int0, low half
+ pmullw mm3, [GOTOFF(ebx,PW_THREE)] ; 3 * Int0, high half
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbour + rounding bias 8
+ paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0, [GOTOFF(ebx,PW_SEVEN)] ; right neighbour + rounding bias 7
+ paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm7
+ paddw mm5, mm3
+ psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
+ psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
+ paddw mm0, mm7
+ paddw mm2, mm3
+ psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
+ psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
+
+ psllw mm0, BYTE_BIT ; move the odd outputs into the high byte of each word
+ psllw mm2, BYTE_BIT
+ por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
+ por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the saved intermediates
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+ ; -- process the lower row
+
+ movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
+ movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
+
+ movq mm7, mm6
+ movq mm3, mm4
+ psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
+ psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
+ movq mm0, mm6
+ movq mm2, mm4
+ psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
+ psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
+
+ por mm7, mm3 ; mm7=( 1 2 3 4)
+ por mm0, mm2 ; mm0=( 3 4 5 6)
+
+ movq mm1, mm6
+ movq mm5, mm4
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
+ movq mm3, mm4
+ psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
+
+ por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
+ por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
+
+ movq MMWORD [wk(1)], mm3 ; column 7 becomes the -1 column of the next block
+
+ pmullw mm6, [GOTOFF(ebx,PW_THREE)] ; 3 * Int1, low half
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)] ; 3 * Int1, high half
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbour + rounding bias 8
+ paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm7, [GOTOFF(ebx,PW_SEVEN)] ; right neighbour + rounding bias 7
+ paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm6
+ paddw mm0, mm4
+ psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
+ psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
+ paddw mm7, mm6
+ paddw mm5, mm4
+ psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
+ psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
+
+ psllw mm7, BYTE_BIT ; move the odd outputs into the high byte of each word
+ psllw mm5, BYTE_BIT
+ por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
+ por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the saved intermediates
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+ poppic ebx ; counterpart of the pushpic at .columnloop / .columnloop_last
+
+ sub eax, byte SIZEOF_MMWORD ; one input block consumed
+ add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_MMWORD ; inptr0
+ add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_MMWORD ; outptr0
+ add edi, byte 2*SIZEOF_MMWORD ; outptr1
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop ; more than one block left
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi ; restore the input_data cursor
+ pop edi ; restore the output_data cursor
+ pop ecx ; restore rowctr
+ pop eax ; restore the unrounded colctr
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return: ; the early exits above arrive here before any MMX instruction has run
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+; Every input sample is written twice, side by side, to the output row.
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+ push ebp
+ mov ebp, esp ; ebp = frame pointer used by the argument %defines above
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
+ jz short .return ; zero columns: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi ; output_data cursor (edi is reused as outptr below)
+ push esi ; input_data cursor (esi is reused as inptr below)
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop: ; unrolled twice: each half turns one input MMWORD into two output MMWORDs
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0 ; low four samples, each duplicated
+ punpckhbw mm1, mm1 ; high four samples, each duplicated
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD ; two output MMWORDs done
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2 ; low four samples, each duplicated
+ punpckhbw mm3, mm3 ; high four samples, each duplicated
+
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD ; two more output MMWORDs done
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 4*SIZEOF_MMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi ; restore the input_data cursor
+ pop edi ; restore the output_data cursor
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return: ; the early exits above arrive here before any MMX instruction has run
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+; Every input sample is written twice horizontally, into each of two output rows.
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+ push ebp
+ mov ebp, esp ; ebp = frame pointer used by the argument %defines above
+ push ebx ; ebx holds outptr0 inside the loop
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
+ jz near .return ; zero columns: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi ; output_data cursor (edi is reused as outptr1 below)
+ push esi ; input_data cursor (esi is reused as inptr below)
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop: ; unrolled twice: each half turns one input MMWORD into two output MMWORDs per row
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0 ; low four samples, each duplicated
+ punpckhbw mm1, mm1 ; high four samples, each duplicated
+
+ movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; same data goes to both output rows
+ movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD ; two output MMWORDs per row done
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2 ; low four samples, each duplicated
+ punpckhbw mm3, mm3 ; high four samples, each duplicated
+
+ movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 ; same data goes to both output rows
+ movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD ; two more output MMWORDs per row done
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add ebx, byte 4*SIZEOF_MMWORD ; outptr0
+ add edi, byte 4*SIZEOF_MMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi ; restore the input_data cursor
+ pop edi ; restore the output_data cursor
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return: ; the early exits above arrive here before any MMX instruction has run
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample-sse2.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2): ; word constants for the fancy upsamplers, eight 16-bit lanes each
+
+PW_ONE times 8 dw 1 ; rounding bias added to the left-neighbour term in the h2v1 routine
+PW_TWO times 8 dw 2 ; rounding bias added to the right-neighbour term in the h2v1 routine
+PW_THREE times 8 dw 3 ; weight applied to the current (nearer) sample
+PW_SEVEN times 8 dw 7 ; rounding bias added to the right-neighbour term in the h2v2 routine
+PW_EIGHT times 8 dw 8 ; rounding bias added to the left-neighbour term in the h2v2 routine
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; Computes out[2i] = (3*s[i] + s[i-1] + 1) >> 2 and out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2.
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push ebp
+ mov ebp, esp ; ebp = frame pointer used by the argument %defines above
+ pushpic ebx ; NOTE(review): macro from jsimdext.inc (not shown); presumably saves ebx for use as GOT base
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return ; zero columns: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi ; output_data cursor (edi is reused as outptr below)
+ push esi ; input_data cursor (esi is reused as inptr below)
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_XMMWORD-1 ; is the width a whole number of XMMWORD blocks?
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (one past the row end; assumes the buffer has slack - TODO confirm)
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7 ; xmm7=(all 1's)
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; mask that keeps only the lowest byte
+ pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm7=( 0 -- ... --): sample 0 acts as its own left neighbour
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD ; colctr rounded up to a multiple of SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block: take the general path first
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb xmm6, xmm6 ; xmm6=(all 1's)
+ pslldq xmm6, (SIZEOF_XMMWORD-1) ; mask that keeps only the highest byte
+ pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm6=(-- ... -- 15): sample 15 acts as its own right neighbour
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] ; peek at the following block
+ pslldq xmm6, (SIZEOF_XMMWORD-1) ; xmm6=(-- ... -- 16): its first sample, moved to the top byte
+
+.upsample:
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] ; aligned load; assumes rows are 16-byte aligned - TODO confirm
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --), the -1 sample of the next block
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [GOTOFF(ebx,PW_THREE)] ; 3 * current sample, low half
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)] ; 3 * current sample, high half
+ paddw xmm2, [GOTOFF(ebx,PW_ONE)] ; left neighbour + rounding bias 1
+ paddw xmm5, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm3, [GOTOFF(ebx,PW_TWO)] ; right neighbour + rounding bias 2
+ paddw xmm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT ; move the odd outputs into the high byte of each word
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 ; aligned store; assumes output rows are 16-byte aligned - TODO confirm
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+ sub eax, byte SIZEOF_XMMWORD ; one input block consumed
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; more than one block left
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi ; restore the input_data cursor
+ pop edi ; restore the output_data cursor
+ pop eax ; restore the unrounded colctr
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx ; counterpart of the pushpic in the prologue
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_XMMWORD
+ add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+; jsimd_h2v1_upsample_sse2 -- sample-doubling upsampler, horizontal only.
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+; Each input sample is stored twice, side by side, in the output row.
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+; Register roles: ecx=rowctr, edx=rounded-up width, eax=colctr,
+; esi/edi=row-pointer arrays (outside .columnloop) or sample pointers (inside).
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push ebp
+ mov ebp, esp ; arguments are addressed as ebp+8 and up
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; restored at .return
+ push edi ; restored at .return
+; Round output_width up to a multiple of 2*SIZEOF_XMMWORD (one loop step).
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD) ; edx = rounded-up output width
+ jz short .return ; zero width: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi ; keep the row-pointer array positions
+ push esi ; while esi/edi walk the samples
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+; First half of the unrolled step: one XMMWORD in, two XMMWORDs out.
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; aligned load (movdqa)
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 ; low-half bytes, each one duplicated
+ punpckhbw xmm1, xmm1 ; high-half bytes, each one duplicated
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD ; colctr counts output bytes
+ jz short .nextrow ; edx is a multiple of the step, so eax reaches 0
+; Second half of the unrolled step: same operation on the next XMMWORD.
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi ; back to the row-pointer arrays
+ pop edi
+; One input row produces one output row.
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop ; signed test: loop while rowctr > 0
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+; jsimd_h2v2_upsample_sse2 -- sample-doubling upsampler in both directions.
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+; Each input sample is stored twice per row, into two consecutive output rows.
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+; Register roles: ecx=rowctr, edx=rounded-up width, eax=colctr,
+; esi=inptr, ebx=outptr0, edi=outptr1 (inside .columnloop).
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push ebp
+ mov ebp, esp ; arguments are addressed as ebp+8 and up
+ push ebx ; used as outptr0 below; restored at .return
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; restored at .return
+ push edi ; restored at .return
+; Round output_width up to a multiple of 2*SIZEOF_XMMWORD (one loop step).
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD) ; edx = rounded-up output width
+ jz near .return ; zero width: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi ; keep the row-pointer array positions
+ push esi ; while esi/edi walk the samples
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+; First half of the unrolled step: one XMMWORD in, two XMMWORDs out per row.
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; aligned load (movdqa)
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 ; low-half bytes, each one duplicated
+ punpckhbw xmm1, xmm1 ; high-half bytes, each one duplicated
+
+ movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; same data to the second row
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD ; colctr counts output bytes per row
+ jz short .nextrow ; edx is a multiple of the step, so eax reaches 0
+; Second half of the unrolled step: same operation on the next XMMWORD.
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi ; back to the row-pointer arrays
+ pop edi
+; One input row produces two output rows, so rowctr drops by 2.
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop ; signed test: loop while rowctr > 0
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt-3dn.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Multipliers used by jsimd_fdct_float_3dnow, addressed through GOTOFF().
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_3dnow)
+; Each entry is the same 32-bit float twice, i.e. one 64-bit MMX operand.
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jsimd_fdct_float_3dnow: in-place floating-point forward DCT using 3DNow!.
+; Perform the forward DCT on one block of samples.
+; Pass 1 transforms the rows, pass 2 the columns; results overwrite *data.
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+; Uses mm0-mm7, eax, ecx, edx; ebx is handled by pushpic/poppic.
+; Stack-argument offset, applied to the frame address that is kept in eax:
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+; Scratch slots wk(0)..wk(WK_NUM-1) lie directly below the aligned ebp.
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; room to store eax after alignment
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; saved so that "pop esp" can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned ebp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+; NOTE(review): pushpic/get_GOT/GOTOFF come from an include; PIC helpers, presumably.
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+; eax is not modified below, so data(eax) is still valid in pass 2.
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; two rows are handled per iteration
+ alignx 16, 7
+.rowloop:
+; Each mm register holds two floats; load the outer columns of both rows.
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+; After the 2x2 transposes each register pairs one column from both rows.
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 10)=data0
+ punpckhdq mm4, mm1 ; mm4=(01 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(06 16)=data6
+ punpckhdq mm5, mm3 ; mm5=(07 17)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+; tmp6/tmp7 are spilled to free mm4/mm0 until the odd part needs them.
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(02 12)=data2
+ punpckhdq mm4, mm3 ; mm4=(03 13)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(04 14)=data4
+ punpckhdq mm0, mm5 ; mm0=(05 15)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+; Even outputs are written to the first row of the pair, left transposed.
+ movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0 ; mm3=tmp10-tmp12
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+; Odd outputs are written to the second row of the pair, left transposed.
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance by two rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+; Input here is the 2x2-transposed layout that pass 1 left behind.
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; two columns are handled per iteration
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+; Transposing again yields one row's values for the two current columns.
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 01)=data0
+ punpckhdq mm4, mm1 ; mm4=(10 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(60 61)=data6
+ punpckhdq mm5, mm3 ; mm5=(70 71)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+; tmp6/tmp7 are spilled to free mm4/mm0 until the odd part needs them.
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(20 21)=data2
+ punpckhdq mm4, mm3 ; mm4=(30 31)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(40 41)=data4
+ punpckhdq mm0, mm5 ; mm0=(50 51)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+; Pass 2 stores each output in the row that matches its index.
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0 ; mm3=tmp10-tmp12
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*SIZEOF_FAST_FLOAT ; advance by two columns
+ dec ecx
+ jnz near .columnloop
+
+ femms ; empty MMX/3DNow! state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt-sse.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44 ; low two floats of %1, then low two floats of %2
+%endmacro
+; Same idea for the upper halves; only the shufps selector differs.
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE ; high two floats of %1, then high two floats of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Multipliers used by jsimd_fdct_float_sse, addressed through GOTOFF().
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+; Each entry is the same 32-bit float four times, i.e. one 128-bit operand.
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jsimd_fdct_float_sse: in-place floating-point forward DCT using SSE.
+; Perform the forward DCT on one block of samples.
+; Pass 1 transforms the rows, pass 2 the columns; results overwrite *data.
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+; Uses xmm0-xmm7, eax, ecx, edx; ebx is handled by pushpic/poppic.
+; Stack-argument offset, applied to the frame address that is kept in eax:
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+; Scratch slots wk(0)..wk(WK_NUM-1) lie directly below the aligned ebp.
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; room to store eax after alignment
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; saved so that "pop esp" can undo the alignment
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned ebp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+; NOTE(review): pushpic/get_GOT/GOTOFF come from an include; PIC helpers, presumably.
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+; eax is not modified below, so data(eax) is still valid in pass 2.
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4 ; four rows are handled per iteration
+ alignx 16, 7
+.rowloop:
+; Each xmm register holds four floats; start with rows 2 and 3 of the group.
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+; The 4x4 transpose is done in two phases: interleave, then combine halves.
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+; Two phase-1 results are spilled to free xmm4/xmm2 for rows 0 and 1.
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+; The wk slots are reloaded and then reused to hold tmp6/tmp7.
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+; Even outputs go to rows 0 and 2 of the group, left 4x4-transposed.
+ movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6 ; xmm2=tmp10-tmp12
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+; Odd outputs go to rows 1 and 3 of the group, left 4x4-transposed.
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; advance by four rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+; Input here is the 4x4-transposed layout that pass 1 left behind.
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4 ; four columns are handled per iteration
+ alignx 16, 7
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+; Transposing again yields one row's values for the four current columns.
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+; Two phase-1 results are spilled to free xmm4/xmm2 for the next loads.
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+; The wk slots are reloaded and then reused to hold tmp6/tmp7.
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+; Pass 2 stores each output in the row that matches its index.
+ movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6 ; xmm2=tmp10-tmp12
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, byte 4*SIZEOF_FAST_FLOAT ; advance by four columns
+ dec ecx
+ jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst-mmx.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fractional bits used by FIX(); 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time floating-point arithmetic: round 2^30-scaled integers instead.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow in the psllw that precedes pmulhw)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps the high word)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+; Each entry is one multiplier replicated into 4 words and pre-shifted by CONST_SHIFT.
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT ; FIX(0.707106781)
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT ; FIX(0.382683433)
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT ; FIX(0.541196100)
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT ; FIX(1.306562965)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples, in place: both passes
+; load coefficients from *data and store their results back to *data.
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data (b + 0 = saved ebp, b + 4 = return address)
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp (esp right after "push ebp")
+ sub esp, byte 4 ; room for the saved copy of eax
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; [aligned ebp + 0] = original ebp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below aligned ebp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; ecx = loop count (4 rows per iteration)
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7 ; mm0=data1
+ movq mm5, mm6 ; mm5=data0
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7 ; mm2=data3
+ movq mm3, mm4 ; mm3=data2
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5 ; mm1=tmp0
+ movq mm6, mm0 ; mm6=tmp1
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5 ; mm0=tmp12+tmp13
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1 ; mm7=tmp10
+ movq mm4, mm5 ; mm4=tmp13
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 ; store data4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 ; store data0
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 ; store data2
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6 ; mm2=tmp10-tmp12
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0 ; mm5=tmp7
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0 ; mm7=z13
+ movq mm4, mm5 ; mm4=z11
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 ; store data3
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 ; store data7
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 ; store data1
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance edx by 4 rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; ecx = loop count (4 columns per iteration)
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7 ; mm0=data1
+ movq mm5, mm6 ; mm5=data0
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7 ; mm2=data3
+ movq mm3, mm4 ; mm3=data2
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5 ; mm1=tmp0
+ movq mm6, mm0 ; mm6=tmp1
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5 ; mm0=tmp12+tmp13
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1 ; mm7=tmp10
+ movq mm4, mm5 ; mm4=tmp13
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 ; store data4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 ; store data0
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 ; store data2
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6 ; mm2=tmp10-tmp12
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0 ; mm5=tmp7
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0 ; mm7=z13
+ movq mm4, mm5 ; mm4=z11
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 ; store data3
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 ; store data7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 ; store data1
+
+ add edx, byte 4*SIZEOF_DCTELEM ; advance edx by 4 columns
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst-sse2.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fractional bits used by FIX(); 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time floating-point arithmetic: round 2^30-scaled integers instead.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow in the psllw that precedes pmulhw)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps the high word)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+; Each entry is one multiplier replicated into 8 words and pre-shifted by CONST_SHIFT.
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT ; FIX(0.707106781)
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT ; FIX(0.382683433)
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT ; FIX(0.541196100)
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; FIX(1.306562965)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples, in place.  The block is
+; accessed with movdqa, so data must be 16-byte aligned.
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data (b + 0 = saved ebp, b + 4 = return address)
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp (esp right after "push ebp")
+ sub esp, byte 4 ; room for the saved copy of eax
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; [aligned ebp + 0] = original ebp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below aligned ebp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1 ; xmm6=data1
+ movdqa xmm3, xmm0 ; xmm3=data0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1 ; xmm2=data3
+ movdqa xmm5, xmm7 ; xmm5=data2
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3 ; xmm4=tmp0
+ movdqa xmm0, xmm6 ; xmm0=tmp1
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3 ; xmm6=tmp12+tmp13
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+ movdqa xmm1, xmm4 ; xmm1=tmp10
+ movdqa xmm7, xmm3 ; xmm7=tmp13
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0 ; xmm2=tmp10-tmp12
+ pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6 ; xmm3=tmp7
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6 ; xmm2=z13
+ movdqa xmm5, xmm3 ; xmm5=z11
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6 ; xmm5=data1
+ movdqa xmm3, xmm1 ; xmm3=data0
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6 ; xmm7=data3
+ movdqa xmm0, xmm2 ; xmm0=data2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3 ; xmm4=tmp0
+ movdqa xmm1, xmm5 ; xmm1=tmp1
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3 ; xmm5=tmp12+tmp13
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+ movdqa xmm6, xmm4 ; xmm6=tmp10
+ movdqa xmm2, xmm3 ; xmm2=tmp13
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 ; store data4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 ; store data6
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 ; store data0
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 ; store data2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1 ; xmm7=tmp10-tmp12
+ pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5 ; xmm3=tmp7
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5 ; xmm6=z13
+ movdqa xmm2, xmm3 ; xmm2=z11
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 ; store data3
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 ; store data7
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 ; store data5
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 ; store data1
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint-avx2.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits in the F_* fixed-point constants
+%define PASS1_BITS 2 ; extra scale bits left in the pass 1 output
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count for pass 1 products
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS) ; right-shift count for pass 2 products
+; Precomputed table for the default scale: FIX(x) = x * 2^13, rounded to nearest.
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and reduced to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; x / 2^n, rounded to nearest
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers (two 8-word rows per register, layouts below)
+; %5-%8: Temp registers (clobbered)
+; In the layout comments each pair is (row column) of the source matrix; the left 8 words are the low 128-bit lane.
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1): interleave words of adjacent rows
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2): interleave dwords, giving 4-element column pieces
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D ; 0x8D: output qwords = input qwords (1,3,0,2)
+ vpermq %2, %2, 0x8D ; odd column lands in the low lane, even column in the high lane
+ vpermq %3, %3, 0xD8 ; 0xD8: output qwords = input qwords (0,2,1,3)
+ vpermq %4, %4, 0xD8 ; even column lands in the low lane, odd column in the high lane
+ ; transpose coefficients(phase 3): each 128-bit lane now holds one full column of the source
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers; in = data1_0, data3_2, data4_5, data6_7, out = data0_4, data3_1, data2_6, data7_5 ("a_b" means a in the low lane, b in the high lane)
+; %5-%8: Temp registers (clobbered)
+; %9: Pass (1 or 2); selects PD_DESCALE_P1/DESCALE_P1 or PD_DESCALE_P2/DESCALE_P2
+; ebx must hold the base that GOTOFF() uses to address the constant tables.
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1 (0x01 swaps the two 128-bit lanes)
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)] ; add rounding bias before the shift
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7 ; low lane=(tmp13,tmp12) pairs, high lane=(tmp12,tmp13) pairs
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] ; rounding bias for this pass
+ vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation; right-hand sides use the original z3/z4)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2 ; low lane=(z3,z4) pairs, high lane=(z4,z3) pairs
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L
+ vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation; right-hand sides use the original tmp4..tmp7)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4 ; low lane=(tmp4,tmp7) pairs, high lane=(tmp5,tmp6) pairs
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2 ; low lane=(tmp6,tmp5) pairs, high lane=(tmp7,tmp4) pairs
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant tables for dodct. Each two-line PW_* table is 32 bytes: the first line fills the low 128-bit lane, the second the high lane.
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; low lane: (tmp13, tmp12) -> data2
+ times 4 dw (F_0_541 - F_1_847), F_0_541 ; high lane: (tmp12, tmp13) -> data6
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; low lane: (z3, z4) -> z3
+ times 4 dw (F_1_175 - F_0_390), F_1_175 ; high lane: (z4, z3) -> z4
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; low lane: (tmp4, tmp7) -> tmp4
+ times 4 dw (F_2_053 - F_2_562), -F_2_562 ; high lane: (tmp5, tmp6) -> tmp5
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 ; low lane: (tmp6, tmp5) -> tmp6
+ times 4 dw (F_1_501 - F_0_899), -F_0_899 ; high lane: (tmp7, tmp4) -> tmp7
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before >> DESCALE_P1
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before >> DESCALE_P2
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) ; rounding bias added before >> PASS1_BITS (pass 2 data0/data4)
+PW_1_NEG1 times 8 dw 1 ; vpsignw operand: low lane is kept as is
+ times 8 dw -1 ; vpsignw operand: high lane is negated
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+; The 8x8 block at *data is loaded once, transformed in registers and stored back in place.
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data (first stack argument, above the saved ebp and the return address)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx ; ebx is used as the GOTOFF() base below and restored by poppic
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(ebp)] ; (DCTELEM *)
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] ; unaligned loads: no alignment of *data is assumed
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; 0x20: low lanes of both sources
+ vperm2i128 ymm1, ymm4, ymm6, 0x31 ; 0x31: high lanes of both sources
+ vperm2i128 ymm2, ymm5, ymm7, 0x20
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+ ; ymm0..ymm2 and ymm4 now have the row-pair layout that dotranspose takes as input
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+ vzeroupper ; clear the upper YMM halves before returning to code that may use legacy SSE
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint-mmx.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits in the F_* fixed-point constants
+%define PASS1_BITS 2 ; extra scale bits left in the pass 1 output
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count for pass 1 products
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS) ; right-shift count for pass 2 products
+; Precomputed table for the default scale: FIX(x) = x * 2^13, rounded to nearest.
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and reduced to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; x / 2^n, rounded to nearest
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant tables for jsimd_fdct_islow_mmx. Each PW_* table is a pair of pmaddwd weights repeated across one 64-bit word.
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 ; (tmp13, tmp12) -> data2
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847) ; (tmp13, tmp12) -> data6
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 ; (z3, z4) -> z3
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) ; (z3, z4) -> z4
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 ; (tmp4, tmp7) -> tmp4
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) ; (tmp4, tmp7) -> tmp7
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 ; (tmp5, tmp6) -> tmp5
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) ; (tmp5, tmp6) -> tmp6
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before >> DESCALE_P1
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before >> DESCALE_P2
+PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1) ; rounding bias added before >> PASS1_BITS (pass 2 data0/data4)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+; The block at *data is transformed in place, first along rows and then along columns.
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data (first stack argument, above the saved ebp and the return address)
+
+%define original_ebp ebp + 0 ; slot at [ebp] that holds the pre-alignment frame address
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2 ; number of mmword scratch slots below the aligned ebp
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp (address of the saved ebp; arguments are read relative to it)
+ sub esp, byte 4 ; make room to store eax
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; keep the unaligned frame address so the epilogue can restore esp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the wk[] scratch area
+ pushpic ebx ; ebx is used as the GOTOFF() base below and restored by poppic
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *) eax is read again when pass 2 reloads this pointer
+ mov ecx, DCTSIZE/4 ; loop counter: each pass 1 iteration advances edx by four rows
+ alignx 16, 7
+.rowloop:
+; Pass 1 body: transpose a band of four rows, run the 1-D DCT on them, and store each result word over the band (data0/4, 1/5, 2/6, 3/7 per block row).
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ psllw mm5, PASS1_BITS ; mm5=data0
+ psllw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)] ; add rounding bias before the shift
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation; right-hand sides use the original z3/z4)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation; right-hand sides use the original tmp4..tmp7)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm1, DESCALE_P1
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm2, DESCALE_P1
+ psrad mm7, DESCALE_P1
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm3, DESCALE_P1
+ psrad mm5, DESCALE_P1
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance to the next band of four rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *) restart at the top of the block; eax still holds the frame address set in the prologue
+ mov ecx, DCTSIZE/4 ; loop counter: each iteration advances edx by four columns
+ alignx 16, 7
+.columnloop:
+; Pass 2 body: transpose four columns of the pass 1 output, run the 1-D DCT on them with the pass 2 descale, and store the results in place.
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)] ; add rounding bias before removing the pass 1 scale
+ paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw mm5, PASS1_BITS ; mm5=data0
+ psraw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)] ; add rounding bias before the shift
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm0, DESCALE_P2
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm6, DESCALE_P2
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation; right-hand sides use the original z3/z4)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation; right-hand sides use the original tmp4..tmp7)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm2, DESCALE_P2
+ psrad mm7, DESCALE_P2
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm3, DESCALE_P2
+ psrad mm5, DESCALE_P2
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*SIZEOF_DCTELEM ; advance to the next four columns
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp (the frame address stored at [ebp] by the prologue)
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fixed-point constants below are scaled by 2^CONST_BITS
+%define PASS1_BITS 2 ; pass-1 output keeps this many extra bits (removed in pass 2)
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right shift applied to pass-1 products
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS) ; right shift applied to pass-2 products
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given pre-scaled by 2^30 and shifted down to CONST_BITS.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift of x by n bits with rounding
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; x (tmp13,tmp12) pairs -> data2
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; x (tmp13,tmp12) pairs -> data6
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; x (z3,z4) pairs -> z3
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; x (z3,z4) pairs -> z4
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; x (tmp4,tmp7) pairs -> tmp4
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; x (tmp4,tmp7) pairs -> tmp7
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; x (tmp5,tmp6) pairs -> tmp5
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; x (tmp5,tmp6) pairs -> tmp6
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before psrad DESCALE_P2
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) ; rounding bias added before psraw PASS1_BITS
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+; The result overwrites *data; data is accessed with movdqa (needs 16-byte alignment).
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0 ; slot where the prologue stores the unaligned frame pointer
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 6 ; wk(0)..wk(5) are used below
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp (unaligned frame; [eax] = saved ebp)
+ sub esp, byte 4 ; room to stash eax
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; stashed here so the epilogue can "pop esp"
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below ebp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; edx = (DCTELEM *)data
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1 ; xmm6=data1
+ movdqa xmm3, xmm0 ; xmm3=data0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1 ; xmm2=data3
+ movdqa xmm5, xmm7 ; xmm5=data2
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3 ; xmm4=tmp0
+ movdqa xmm0, xmm6 ; xmm0=tmp1
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3 ; xmm1=tmp10
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0 ; xmm7/xmm6=(tmp13 tmp12) word pairs, low/high
+ movdqa xmm4, xmm7 ; second copy of the pairs for data6
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; add rounding bias before the shift
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0 ; xmm7/xmm4=(z3 z4) word pairs, low/high
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1 ; xmm7/xmm4=(tmp4 tmp7) word pairs, low/high
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3 ; xmm4/xmm1=(tmp5 tmp6) word pairs, low/high
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *) -- not needed: edx is still live from pass 1
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5 ; xmm2=data1
+ movdqa xmm7, xmm6 ; xmm7=data0
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5 ; xmm0=data3
+ movdqa xmm3, xmm4 ; xmm3=data2
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7 ; xmm1=tmp0
+ movdqa xmm6, xmm2 ; xmm6=tmp1
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7 ; xmm5=tmp10
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)] ; add rounding bias before the shift
+ paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6 ; xmm4/xmm2=(tmp13 tmp12) word pairs, low/high
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6 ; xmm4/xmm1=(z3 z4) word pairs, low/high
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5 ; xmm4/xmm1=(tmp4 tmp7) word pairs, low/high
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7 ; xmm1/xmm5=(tmp5 tmp6) word pairs, low/high
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414 times 2 dd 1.414213562373095048801689 ; multiplier used when forming tmp12 and tmp11
+PD_1_847 times 2 dd 1.847759065022573512256366 ; z5 = (z10 + z12) * 1.847759065
+PD_1_082 times 2 dd 1.082392200292393968799446 ; multiplier applied to z12
+PD_2_613 times 2 dd 2.613125929752753055713286 ; multiplier applied to z10
+PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3); added so the low word holds roundint(x/8)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; CENTERJSAMPLE replicated into 8 bytes
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ pushpic ebx ; save GOT address
+ mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ or eax, ebx
+ poppic ebx ; restore GOT address
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm1, mm0
+ punpckldq mm0, mm0
+ punpckhdq mm1, mm1
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ punpcklwd mm1, mm1
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
+ pi2fd mm1, mm1
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm5, mm5
+ punpcklwd mm1, mm1
+ psrad mm5, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm5, mm5
+ pi2fd mm1, mm1
+
+ pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 01)
+ pfadd mm7, mm3 ; mm7=data1=(10 11)
+ pfsub mm5, mm1 ; mm5=data7=(70 71)
+ pfsub mm0, mm3 ; mm0=data6=(60 61)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, mm6 ; transpose coefficients
+ punpckldq mm6, mm7 ; mm6=(00 10)
+ punpckhdq mm1, mm7 ; mm1=(01 11)
+ movq mm3, mm0 ; transpose coefficients
+ punpckldq mm0, mm5 ; mm0=(60 70)
+ punpckhdq mm3, mm5 ; mm3=(61 71)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm5, MMWORD [wk(1)] ; mm5=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm6, mm7
+ movq mm1, mm5
+ pfadd mm7, mm2 ; mm7=data2=(20 21)
+ pfadd mm5, mm4 ; mm5=data4=(40 41)
+ pfsub mm6, mm2 ; mm6=data5=(50 51)
+ pfsub mm1, mm4 ; mm1=data3=(30 31)
+
+ movq mm0, mm7 ; transpose coefficients
+ punpckldq mm7, mm1 ; mm7=(20 30)
+ punpckhdq mm0, mm1 ; mm0=(21 31)
+ movq mm3, mm5 ; transpose coefficients
+ punpckldq mm5, mm6 ; mm5=(40 50)
+ punpckhdq mm3, mm6 ; mm3=(41 51)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+ add esi, byte 2*SIZEOF_JCOEF ; coef_block
+ add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 10)
+ pfadd mm7, mm3 ; mm7=data1=(01 11)
+ pfsub mm5, mm1 ; mm5=data7=(07 17)
+ pfsub mm0, mm3 ; mm0=data6=(06 16)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
+ pcmpeqd mm3, mm3
+ psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
+ pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
+ pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
+ pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+ pand mm6, mm3 ; mm6=(00 -- 10 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11)
+ pand mm0, mm3 ; mm0=(06 -- 16 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17)
+ por mm6, mm7 ; mm6=(00 01 10 11)
+ por mm0, mm5 ; mm0=(06 07 16 17)
+
+ movq mm1, MMWORD [wk(0)] ; mm1=tmp2
+ movq mm3, MMWORD [wk(1)] ; mm3=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm7, mm1
+ movq mm5, mm3
+ pfadd mm1, mm2 ; mm1=data2=(02 12)
+ pfadd mm3, mm4 ; mm3=data4=(04 14)
+ pfsub mm7, mm2 ; mm7=data5=(05 15)
+ pfsub mm5, mm4 ; mm5=data3=(03 13)
+
+ movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
+ pcmpeqd mm4, mm4
+ psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
+ pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
+ pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
+ pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+ pand mm3, mm4 ; mm3=(04 -- 14 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15)
+ pand mm1, mm4 ; mm1=(02 -- 12 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13)
+ por mm3, mm7 ; mm3=(04 05 14 15)
+ por mm1, mm5 ; mm1=(02 03 12 13)
+
+ movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
+
+ packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15)
+ packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17)
+ paddb mm6, mm2
+ paddb mm1, mm2
+
+ movq mm4, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 2*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt-sse.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; in: %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => out: %1=(a0 a1 b0 b1)
+ shufps %1, %2, 01000100b ; selector fields, high to low: src[1] src[0] dst[1] dst[0]
+%endmacro
+
+%macro unpckhps2 2 ; in: %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => out: %1=(a2 a3 b2 b3)
+ shufps %1, %2, 11101110b ; selector fields, high to low: src[3] src[2] dst[3] dst[2]
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST ; constant data referenced by jsimd_idct_float_sse
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2), replicated in four float lanes
+PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)
+PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8) - cos(3*pi/8))
+PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8) + cos(3*pi/8))
+PD_0_125 times 4 dd 0.125 ; 1/8, the descale factor applied in pass 2
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; 8 bytes = one MMX quadword of CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table (b = "original ebp" as set up by the prologue)
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0 ; slot at [aligned ebp] where the prologue stores "original ebp"
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM], located directly below the aligned ebp
+%define WK_NUM 2 ; two scratch slots: wk(0) and wk(1)
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+ push ebp ; save the caller's frame pointer
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; leave room for the slot written two lines below
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; [aligned ebp] = original ebp (read back via original_ebp)
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace] ; esp = &workspace[0]; wk[] and workspace[] sit between esp and ebp
+ push ebx ; ebx/esi/edi are saved here and restored in the epilogue
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; left disabled: the loads below use eax as set in the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr (each iteration advances inptr by 4 coefficients)
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap pre-check on one dword each from
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] ; blocks (1,0) and (2,0)
+ jnz near .columnDCT ; something nonzero: do the full IDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; mm0/mm1 accumulate the OR of rows 1..7
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0 ; mm1 = OR of rows 1..7, one word per column
+ packsswb mm1, mm1 ; saturating pack: a nonzero word stays a nonzero byte
+ movd eax, mm1 ; eax = the four packed bytes
+ test eax, eax
+ jnz short .columnDCT ; at least one AC term is nonzero
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm1, mm0 ; mm1=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize the four DC terms
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; each broadcast vector is written
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 ; to blocks (r,0) and (r,1)
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn ; skip the full IDCT for these columns
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; rows 0/2/4/6, four 16-bit coefficients each
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm4, mm0 ; mm4=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm5, mm1 ; mm5=(** 22 ** 23)
+ punpcklwd mm1, mm1 ; mm1=(20 20 21 21)
+
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
+ cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **)
+
+ punpckhwd mm6, mm2 ; mm6=(** 42 ** 43)
+ punpcklwd mm2, mm2 ; mm2=(40 40 41 41)
+ punpckhwd mm7, mm3 ; mm7=(** 62 ** 63)
+ punpcklwd mm3, mm3 ; mm3=(60 60 61 61)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
+ cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **)
+ cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **)
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
+ cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **)
+ cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **)
+
+ movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03)
+ movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23)
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in0
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in2
+
+ movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43)
+ movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63)
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in4
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in6
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3 ; xmm1=in2-in6
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; rows 1/3/5/7, four 16-bit coefficients each
+ movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm6, mm4 ; mm6=(** 12 ** 13)
+ punpcklwd mm4, mm4 ; mm4=(10 10 11 11)
+ punpckhwd mm2, mm0 ; mm2=(** 32 ** 33)
+ punpcklwd mm0, mm0 ; mm0=(30 30 31 31)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
+ cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **)
+ cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
+ cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **)
+ cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **)
+
+ punpckhwd mm7, mm5 ; mm7=(** 52 ** 53)
+ punpcklwd mm5, mm5 ; mm5=(50 50 51 51)
+ punpckhwd mm3, mm1 ; mm3=(** 72 ** 73)
+ punpcklwd mm1, mm1 ; mm1=(70 70 71 71)
+
+ movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13)
+ movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33)
+
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
+ cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **)
+ cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
+ cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in1
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in3
+
+ movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53)
+ movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73)
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in5
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in7
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5 ; xmm2=z11-z13
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4 ; xmm0=z10+z12
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 ; store (00 10 20 30)
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 ; store (01 11 21 31)
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 ; store (02 12 22 32)
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; store (03 13 23 33)
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 ; store (40 50 60 70)
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 ; store (41 51 61 71)
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 ; store (42 52 62 72)
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 ; store (43 53 63 73)
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block: advance by 4 coefficients
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr: advance by 4 multipliers
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr: advance by 4*DCTSIZE floats
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] ; four prefetches at 32-byte steps
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; eax = original ebp (base for the argument macros)
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
+ mov ecx, DCTSIZE/4 ; ctr (each iteration stores 4 output rows)
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; even inputs: blocks 0/2/4/6
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3 ; xmm1=(input 2)-(input 6)
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)] ; scale the difference by 1.414213562
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] ; odd inputs: blocks 1/3/5/7
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5 ; xmm2=z11-z13
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4 ; xmm0=z10+z12
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
+
+ mulps xmm6, xmm1 ; descale(1/8)
+ mulps xmm7, xmm1 ; descale(1/8)
+ mulps xmm5, xmm1 ; descale(1/8)
+ mulps xmm0, xmm1 ; descale(1/8)
+
+ movhlps xmm3, xmm6 ; xmm3=(20 30 ** **)
+ movhlps xmm1, xmm7 ; xmm1=(21 31 ** **)
+ cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10)
+ cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11)
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31)
+ packssdw mm0, mm2 ; mm0=data0=(00 10 20 30)
+ packssdw mm1, mm3 ; mm1=data1=(01 11 21 31)
+
+ movhlps xmm6, xmm5 ; xmm6=(27 37 ** **)
+ movhlps xmm7, xmm0 ; xmm7=(26 36 ** **)
+ cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17)
+ cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16)
+ cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37)
+ cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36)
+ packssdw mm4, mm6 ; mm4=data7=(07 17 27 37)
+ packssdw mm5, mm7 ; mm5=data6=(06 16 26 36)
+
+ packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36)
+ packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37)
+
+ movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
+ movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm5, xmm3
+ movaps xmm0, xmm1
+ addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+ addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34)
+ subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35)
+ subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33)
+
+ mulps xmm3, xmm6 ; descale(1/8)
+ mulps xmm1, xmm6 ; descale(1/8)
+ mulps xmm5, xmm6 ; descale(1/8)
+ mulps xmm0, xmm6 ; descale(1/8)
+
+ movhlps xmm7, xmm3 ; xmm7=(22 32 ** **)
+ movhlps xmm2, xmm1 ; xmm2=(24 34 ** **)
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14)
+ cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32)
+ cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34)
+ packssdw mm2, mm6 ; mm2=data2=(02 12 22 32)
+ packssdw mm3, mm7 ; mm3=data4=(04 14 24 34)
+
+ movhlps xmm4, xmm5 ; xmm4=(25 35 ** **)
+ movhlps xmm6, xmm0 ; xmm6=(23 33 ** **)
+ cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15)
+ cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13)
+ cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35)
+ cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33)
+ packssdw mm5, mm6 ; mm5=data5=(05 15 25 35)
+ packssdw mm4, mm7 ; mm4=data3=(03 13 23 33)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm6 ; add CENTERJSAMPLE to every packed sample byte
+ paddb mm1, mm6
+ paddb mm2, mm6
+ paddb mm4, mm6
+
+ movq mm7, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37)
+ movq mm3, mm2 ; transpose coefficients(phase 1)
+ punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33)
+ punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm6, mm3 ; transpose coefficients(phase 2)
+ punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17)
+ punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37)
+
+ movq mm1, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17)
+ movq mm4, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = output row pointer 0
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; ebx = output row pointer 1 (ebx reused as scratch)
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 ; row 0: samples 00..07 at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 ; row 1: samples 10..17 at output_col
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; edx = output row pointer 2
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; ebx = output row pointer 3
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 ; row 2: samples 20..27 at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 ; row 3: samples 30..37 at output_col
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi ; restore the registers pushed by the prologue, in reverse order
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore the caller's frame pointer
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt-sse2.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; in: %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => out: %1=(a0 a1 b0 b1)
+ shufps %1, %2, 01000100b ; selector fields, high to low: src[1] src[0] dst[1] dst[0]
+%endmacro
+
+%macro unpckhps2 2 ; in: %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => out: %1=(a2 a3 b2 b3)
+ shufps %1, %2, 11101110b ; selector fields, high to low: src[3] src[2] dst[3] dst[2]
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST ; constant data referenced by jsimd_idct_float_sse2
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2), replicated in four float lanes
+PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)
+PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8) - cos(3*pi/8))
+PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8) + cos(3*pi/8))
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) = 1.5 * 2^26
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes = one XMM word of CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table (b = "original ebp" as set up by the prologue)
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0 ; slot at [aligned ebp] where the prologue stores "original ebp"
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM], located directly below the aligned ebp
+%define WK_NUM 2 ; two scratch slots: wk(0) and wk(1)
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fraction bits of the F_* constants (FIX(1) = 1 << CONST_BITS); 14 is also OK.
+%define PASS1_BITS 2 ; pass 2 descales its results by (PASS1_BITS+3) bits
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift by n with round-to-nearest
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) (literal is the value * 2^30)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) (literal is the value * 2^30)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) (literal is the value * 2^30)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) (literal is the value * 2^30)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; left shift (psllw) applied to an operand before each pmulhw
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift baked into the table entries so the three shifts total 16
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414 times 4 dw F_1_414 << CONST_SHIFT ; 4 words of FIX(1.414213562), pmulhw operand
+PW_F1847 times 4 dw F_1_847 << CONST_SHIFT ; 4 words of FIX(1.847759065), pmulhw operand
+PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT ; 4 words of -(FIX(2.613125930) - FIX(1)), pmulhw operand
+PW_F1082 times 4 dw F_1_082 << CONST_SHIFT ; 4 words of FIX(1.082392200), pmulhw operand
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; 8 bytes of CENTERJSAMPLE, added with paddb after packsswb
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0 ; slot at the aligned ebp where the prologue stores eax
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2], placed directly below wk[]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+ push ebp ; save caller's frame pointer
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4 ; make room for the slot that keeps eax
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; [original_ebp] = eax (reloaded before pass 2 and by the final "pop esp")
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace] ; reserve wk[] and workspace[] below ebp
+ push ebx ; ebx is used below (GOT address, row pointer)
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; not needed: eax still holds it from the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap first test (presumably first dword of rows 1 and 2)
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT ; nonzero AC term -> full IDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together blocks 1..7 (all AC terms of these 4 columns)
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0 ; mm1=OR of all AC terms
+ packsswb mm1, mm1 ; squeeze the 4 words into the low dword (nonzero stays nonzero)
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT ; nonzero AC term -> full IDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; dequantize the DC terms
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; each DC value is written to both halves of its block
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn ; skip the full IDCT for these 4 columns
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; coefficient block 0
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; coefficient block 2
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm0=in0 (dequantized)
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm1=in2 (dequantized)
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; coefficient block 4
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; coefficient block 6
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm2=in4 (dequantized)
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm3=in6 (dequantized)
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3 ; mm1=in2-in6
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)] ; mm1=(in2-in6)*1.414213562
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; coefficient block 1
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] ; coefficient block 3
+ pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm2=in1 (dequantized)
+ pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm3=in3 (dequantized)
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] ; coefficient block 5
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] ; coefficient block 7
+ pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm5=in5 (dequantized)
+ pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] ; mm1=in7 (dequantized)
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS ; mm2=z12 pre-scaled for pmulhw
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS ; mm5=z10 pre-scaled for pmulhw
+
+ movq mm3, mm4
+ psubw mm4, mm0 ; mm4=z11-z13
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0, mm5
+ paddw mm5, mm2 ; mm5=z10+z12 (both pre-scaled)
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] ; mm0=(z10 * -1.613125930)
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)] ; mm2=(z12 * 1.082392200)
+ psubw mm0, mm1 ; mm0=(z10 * -1.613125930) - z10
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 01 02 03)
+ paddw mm7, mm0 ; mm7=data1=(10 11 12 13)
+ psubw mm1, mm3 ; mm1=data7=(70 71 72 73)
+ psubw mm5, mm0 ; mm5=data6=(60 61 62 63)
+ psubw mm4, mm0 ; mm4=tmp5
+
+ movq mm3, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm3, mm7 ; mm3=(02 12 03 13)
+ movq mm0, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm1 ; mm5=(60 70 61 71)
+ punpckhwd mm0, mm1 ; mm0=(62 72 63 73)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm1, MMWORD [wk(1)] ; mm1=tmp3
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
+ movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm7
+ movq mm0, mm1
+ paddw mm7, mm4 ; mm7=data2=(20 21 22 23)
+ paddw mm1, mm2 ; mm1=data4=(40 41 42 43)
+ psubw mm5, mm4 ; mm5=data5=(50 51 52 53)
+ psubw mm0, mm2 ; mm0=data3=(30 31 32 33)
+
+ movq mm4, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm0 ; mm7=(20 30 21 31)
+ punpckhwd mm4, mm0 ; mm4=(22 32 23 33)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm5 ; mm1=(40 50 41 51)
+ punpckhwd mm2, mm5 ; mm2=(42 52 43 53)
+
+ movq mm0, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(00 10 20 30)
+ punpckhdq mm0, mm7 ; mm0=(01 11 21 31)
+ movq mm5, mm3 ; transpose coefficients(phase 2)
+ punpckldq mm3, mm4 ; mm3=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
+ movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 ; store (00 10 20 30)
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 ; store (01 11 21 31)
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 ; store (02 12 22 32)
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 ; store (03 13 23 33)
+
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm7 ; mm1=(40 50 60 70)
+ punpckhdq mm6, mm7 ; mm6=(41 51 61 71)
+ movq mm0, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(42 52 62 72)
+ punpckhdq mm0, mm4 ; mm0=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 ; store (40 50 60 70)
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 ; store (41 51 61 71)
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 ; store (42 52 62 72)
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 ; store (43 53 63 73)
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; eax was clobbered by the zero-column test in pass 1
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax=output_col from here on
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; mm0=in0 (workspace block 0)
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; mm1=in2 (workspace block 2)
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; mm2=in4 (workspace block 4)
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; mm3=in6 (workspace block 6)
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3 ; mm1=in2-in6
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)] ; mm1=(in2-in6)*1.414213562
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; mm2=in1 (workspace block 1)
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] ; mm3=in3 (workspace block 3)
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] ; mm5=in5 (workspace block 5)
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] ; mm1=in7 (workspace block 7)
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS ; mm2=z12 pre-scaled for pmulhw
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS ; mm5=z10 pre-scaled for pmulhw
+
+ movq mm3, mm4
+ psubw mm4, mm0 ; mm4=z11-z13
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0, mm5
+ paddw mm5, mm2 ; mm5=z10+z12 (both pre-scaled)
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] ; mm0=(z10 * -1.613125930)
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)] ; mm2=(z12 * 1.082392200)
+ psubw mm0, mm1 ; mm0=(z10 * -1.613125930) - z10
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 10 20 30)
+ paddw mm7, mm0 ; mm7=data1=(01 11 21 31)
+ psraw mm6, (PASS1_BITS+3) ; descale
+ psraw mm7, (PASS1_BITS+3) ; descale
+ psubw mm1, mm3 ; mm1=data7=(07 17 27 37)
+ psubw mm5, mm0 ; mm5=data6=(06 16 26 36)
+ psraw mm1, (PASS1_BITS+3) ; descale
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psubw mm4, mm0 ; mm4=tmp5
+
+ packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36)
+ packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37)
+
+ movq mm3, MMWORD [wk(0)] ; mm3=tmp2
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp3
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm3
+ movq mm1, mm0
+ paddw mm3, mm4 ; mm3=data2=(02 12 22 32)
+ paddw mm0, mm2 ; mm0=data4=(04 14 24 34)
+ psraw mm3, (PASS1_BITS+3) ; descale
+ psraw mm0, (PASS1_BITS+3) ; descale
+ psubw mm5, mm4 ; mm5=data5=(05 15 25 35)
+ psubw mm1, mm2 ; mm1=data3=(03 13 23 33)
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psraw mm1, (PASS1_BITS+3) ; descale
+
+ movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
+
+ packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34)
+ packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35)
+
+ paddb mm6, mm4 ; add CENTERJSAMPLE to every packed byte
+ paddb mm7, mm4 ; add CENTERJSAMPLE to every packed byte
+ paddb mm3, mm4 ; add CENTERJSAMPLE to every packed byte
+ paddb mm1, mm4 ; add CENTERJSAMPLE to every packed byte
+
+ movq mm2, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm0, mm3 ; transpose coefficients(phase 1)
+ punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33)
+ punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm4, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17)
+ punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17)
+ movq mm1, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx=output row pointer 0 of this group
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; ebx=output row pointer 1 (ebx reused as scratch)
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 ; store 8 samples of row 0 at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 ; store 8 samples of row 1 at output_col
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; edx=output row pointer 2
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; ebx=output row pointer 3
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 ; store 8 samples of row 2 at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 ; store 8 samples of row 3 at output_col
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore caller's frame pointer
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fraction bits of the F_* constants (FIX(1) = 1 << CONST_BITS); 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift by n with round-to-nearest
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) (literal is the value * 2^30)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) (literal is the value * 2^30)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) (literal is the value * 2^30)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) (literal is the value * 2^30)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; left shift (psllw) applied to an operand before each pmulhw
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift baked into the table entries so the three shifts total 16
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT ; 8 words of FIX(1.414213562), pmulhw operand
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT ; 8 words of FIX(1.847759065), pmulhw operand
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT ; 8 words of -(FIX(2.613125930) - FIX(1)), pmulhw operand
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT ; 8 words of FIX(1.082392200), pmulhw operand
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes of CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0 ; slot at the aligned ebp where the prologue stores eax
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
+ pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
+ pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the F_x_xxx fixed-point constants
+%define PASS1_BITS 2 ; extra fraction bits kept in the pass 1 output
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count used at the end of pass 1
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; right-shift count used at the end of pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is pre-scaled by 2^30 and rounded to CONST_BITS fraction bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers (in: data0_1, data3_2, data4_5, data7_6; out: data0_4, data1_5, data2_6, data3_7)
+; %5-%8: Temp registers (contents are overwritten)
+
+%macro dotranspose 8
+ ; %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8 ; swap the two middle quadwords
+ vpermq %6, %2, 0x72 ; quadword order 2,0,3,1: also undoes the 3_2 / 7_6 pairing
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6 ; interleave words within each 128-bit lane
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6 ; join the matching quadwords of each lane
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers (in: in0_4, in3_1, in2_6, in7_5; out: data0_1, data3_2, data4_5, data7_6)
+; %5-%8: Temp registers; %9-%12: 256-bit temp storage (written and read back with vmovdqu)
+; %13: Pass (1 or 2); selects PD_DESCALE_Pn / DESCALE_Pn
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)] ; %1=in0_(-in4)
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
+
+ vpxor %1, %1, %1 ; %1=0, so each word below lands in the upper half of a dword
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %3, %8, %5
+ vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %3, %8, %5
+ vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %3, %1, %6
+ vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %3, %1, %6
+ vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L
+ vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage
+
+ vmovdqu %3, %9 ; %3=tmp10_11L
+ vmovdqu %4, %10 ; %4=tmp10_11H
+
+ vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vmovdqu %7, %11 ; %7=tmp13_12L
+ vmovdqu %8, %12 ; %8=tmp13_12H
+
+ vpaddd %2, %7, %5 ; %2=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %3, %8, %6 ; %3=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpackssdw %2, %2, %3 ; %2=data3_2
+
+ vpsubd %3, %7, %5 ; %3=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %6, %8, %6 ; %6=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %6, %6, DESCALE_P %+ %13
+ vpackssdw %3, %3, %6 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; low 128 bits: (in2,in6) pairs -> tmp3
+ times 4 dw (F_0_541 - F_1_847), F_0_541 ; high 128 bits: (in6,in2) pairs -> tmp2
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; low 128 bits: (z3,z4) pairs -> z3
+ times 4 dw (F_1_175 - F_0_390), F_1_175 ; high 128 bits: (z4,z3) pairs -> z4
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; low 128 bits: (in7,in1) pairs -> tmp0
+ times 4 dw (F_2_053 - F_2_562), -F_2_562 ; high 128 bits: (in5,in3) pairs -> tmp1
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; low 128 bits: (in7,in1) pairs -> tmp3
+ times 4 dw -F_2_562, (F_3_072 - F_2_562) ; high 128 bits: (in5,in3) pairs -> tmp2
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding term added before the pass 1 shift
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding term added before the pass 2 shift
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE ; added to every packed output byte
+PW_1_NEG1 times 8 dw 1 ; vpsignw operand: low 128 bits are kept as is
+ times 8 dw -1 ; vpsignw operand: high 128 bits are negated
+
+ alignz 32
+
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; (Pass 1 transforms columns, pass 2 rows; 8 samples are stored to each of output_buf[0..7] at output_col.)
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Arguments are read from the stack. Clobbers eax, edx, flags and ymm0-ymm7.
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push ebp
+ mov eax, esp ; eax = esp after the push (points at the saved ebp; arguments start at eax+8)
+ sub esp, byte 4 ; room to store that pointer
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits (wk[] is only accessed with vmovdqu)
+ mov [esp], eax ; [original_ebp] = pointer to the saved ebp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below ebp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; quick test on the first dword of rows 1 and 2
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ vmovdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; full test: OR rows 1-7 together
+ vmovdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0
+ vpacksswb xmm1, xmm1, xmm1 ; saturating packs keep nonzero values nonzero
+ vpacksswb xmm1, xmm1, xmm1 ; low dword now summarizes all 8 words
+ vmovd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ vmovdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ mov eax, [original_ebp] ; eax = pointer to the saved ebp (base for the arguments)
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+ vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ vzeroupper ; clear the upper YMM halves before the legacy-SSE movq stores
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits of the fixed-point F_* multipliers below
+%define PASS1_BITS 2 ; pass-1 results are kept scaled up by 2^PASS1_BITS
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count used to descale pass-1 results
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; right-shift count used to descale pass-2 results
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each constant is given as an integer scaled by 2^30 and rounded down to CONST_BITS fractional bits with DESCALE().
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; round-to-nearest right shift of x by n bits
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+ ; Constant table for jsimd_idct_islow_mmx. Each PW_* entry is a (word, word) pair repeated twice, used as a pmaddwd operand against interleaved input words.
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 ; (z2, z3) -> tmp3 of the even part
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847) ; (z2, z3) -> tmp2 of the even part
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 ; (z3, z4) -> z3 of the odd part
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) ; (z3, z4) -> z4 of the odd part
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 ; (tmp0, tmp3) -> tmp0 of the odd part
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) ; (tmp0, tmp3) -> tmp3 of the odd part
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 ; (tmp1, tmp2) -> tmp1 of the odd part
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) ; (tmp1, tmp2) -> tmp2 of the odd part
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the pass-1 psrad by DESCALE_P1
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before the pass-2 psrad by DESCALE_P2
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; byte-wise offset added to the packed pass-2 output samples
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+ ; (b) is the frame address captured at entry (esp right after "push ebp"); the offsets above index the stack arguments from it.
+%define original_ebp ebp + 0 ; slot at the aligned ebp where the prologue stores that entry-time frame address
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 12
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+ ; Entry: builds an 8-byte-aligned frame holding wk[WK_NUM] and the JCOEF workspace, then sets up pass 1.
+EXTN(jsimd_idct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp (address of the saved ebp; stack arguments are addressed from it)
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; keep the unaligned frame address so the epilogue can restore esp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; (left disabled: eax is used below as it was set in the prologue)
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr (each iteration handles four columns)
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap scalar pre-test: OR one dword of row 1 ...
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ... with one dword of row 2
+ jnz short .columnDCT
+ ; Full test: OR rows 1-7 of the current four columns together.
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1 ; squeeze the four words into four bytes (a nonzero word stays nonzero under saturation)
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+ ; Only row 0 contributes: dequantize it, scale by 2^PASS1_BITS and replicate each column's value eight times.
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw mm0, PASS1_BITS
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+ ; Each broadcast register fills both MMWORD halves (index 0 and 1) of one workspace row.
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+ ; Pass-1 butterfly on four columns at once: dequantize, 1-D IDCT, descale by DESCALE_P1, transpose and store to the workspace.
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+ ; Widening trick: unpacking above a zero low word puts each value in the high half of a dword; psrad by (16-CONST_BITS) then leaves it sign-extended and scaled by 2^CONST_BITS.
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ ; Here mm4=in1, mm6=in3, mm1=in5, mm3=in7 (dequantized).
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+ ; z3 is spilled to wk(10)/wk(11); z4 stays in mm5/mm7.
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+ ; z3 has now been consumed, so wk(10)/wk(11) are reused to hold tmp1.
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm5, mm7 ; mm5=data0=(00 01 02 03)
+ packssdw mm2, mm0 ; mm2=data7=(70 71 72 73)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P1
+ psrad mm3, DESCALE_P1
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm4, mm3 ; mm4=data1=(10 11 12 13)
+ packssdw mm7, mm0 ; mm7=data6=(60 61 62 63)
+ ; Start transposing; the partial results are parked in wk(0), wk(1), wk(4), wk(5) to free registers.
+ movq mm6, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm4 ; mm5=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm1, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm2 ; mm7=(60 70 61 71)
+ punpckhwd mm1, mm2 ; mm1=(62 72 63 73)
+
+ movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
+ movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
+ movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
+ movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
+ movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
+ movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
+
+ movq mm5, mm3
+ movq mm6, mm0
+ paddd mm3, mm4 ; mm3=data2L
+ paddd mm0, mm2 ; mm0=data2H
+ psubd mm5, mm4 ; mm5=data5L
+ psubd mm6, mm2 ; mm6=data5H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
+
+ paddd mm3, mm7
+ paddd mm0, mm7
+ psrad mm3, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm5, mm7
+ paddd mm6, mm7
+ psrad mm5, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm3, mm0 ; mm3=data2=(20 21 22 23)
+ packssdw mm5, mm6 ; mm5=data5=(50 51 52 53)
+
+ movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
+ movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
+ movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
+ movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
+
+ movq mm0, mm1
+ movq mm6, mm4
+ paddd mm1, mm2 ; mm1=data3L
+ paddd mm4, mm7 ; mm4=data3H
+ psubd mm0, mm2 ; mm0=data4L
+ psubd mm6, mm7 ; mm6=data4H
+
+ movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
+
+ paddd mm1, mm2
+ paddd mm4, mm2
+ psrad mm1, DESCALE_P1
+ psrad mm4, DESCALE_P1
+ paddd mm0, mm2
+ paddd mm6, mm2
+ psrad mm0, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm1, mm4 ; mm1=data3=(30 31 32 33)
+ packssdw mm0, mm6 ; mm0=data4=(40 41 42 43)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
+ movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
+
+ movq mm4, mm3 ; transpose coefficients(phase 1)
+ punpcklwd mm3, mm1 ; mm3=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm6, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm5 ; mm0=(40 50 41 51)
+ punpckhwd mm6, mm5 ; mm6=(42 52 43 53)
+
+ movq mm1, mm7 ; transpose coefficients(phase 2)
+ punpckldq mm7, mm3 ; mm7=(00 10 20 30)
+ punpckhdq mm1, mm3 ; mm1=(01 11 21 31)
+ movq mm5, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
+ movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
+ ; First MMWORD of workspace rows 0-3: outputs 0-3 of each of the four columns.
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpckldq mm0, mm3 ; mm0=(40 50 60 70)
+ punpckhdq mm7, mm3 ; mm7=(41 51 61 71)
+ movq mm1, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm4 ; mm6=(42 52 62 72)
+ punpckhdq mm1, mm4 ; mm1=(43 53 63 73)
+ ; Second MMWORD of workspace rows 0-3: outputs 4-7 of each of the four columns.
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block (next four input columns)
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr (matching four multiplier columns)
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr (skip the four workspace rows just written)
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; reload the entry-time frame address saved by the prologue
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on (frame address no longer needed)
+ mov ecx, DCTSIZE/4 ; ctr (each iteration produces four output rows)
+ alignx 16, 7
+.rowloop:
+ ; Pass-2 butterfly: same arithmetic as pass 1, but reading the workspace (no dequantization), descaling by DESCALE_P2 and packing to bytes.
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+ ; Widening trick: unpacking above a zero low word puts each value in the high half of a dword; psrad by (16-CONST_BITS) then leaves it sign-extended and scaled by 2^CONST_BITS.
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ ; Here mm4=in1, mm6=in3, mm1=in5, mm3=in7.
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+ ; z3 is spilled to wk(10)/wk(11); z4 stays in mm5/mm7.
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+ ; z3 has now been consumed, so wk(10)/wk(11) are reused to hold tmp1.
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm5, mm7 ; mm5=data0=(00 10 20 30)
+ packssdw mm2, mm0 ; mm2=data7=(07 17 27 37)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm4, mm3 ; mm4=data1=(01 11 21 31)
+ packssdw mm7, mm0 ; mm7=data6=(06 16 26 36)
+ ; packsswb saturates words to signed bytes and pairs two output columns per register.
+ packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36)
+ packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37)
+
+ movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
+ movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
+ movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
+ movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
+ movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
+
+ movq mm7, mm6
+ movq mm2, mm1
+ paddd mm6, mm3 ; mm6=data2L
+ paddd mm1, mm0 ; mm1=data2H
+ psubd mm7, mm3 ; mm7=data5L
+ psubd mm2, mm0 ; mm2=data5H
+
+ movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
+
+ paddd mm6, mm5
+ paddd mm1, mm5
+ psrad mm6, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm7, mm5
+ paddd mm2, mm5
+ psrad mm7, DESCALE_P2
+ psrad mm2, DESCALE_P2
+
+ packssdw mm6, mm1 ; mm6=data2=(02 12 22 32)
+ packssdw mm7, mm2 ; mm7=data5=(05 15 25 35)
+
+ movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
+ movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
+ movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
+ movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
+
+ movq mm1, mm4
+ movq mm2, mm3
+ paddd mm4, mm0 ; mm4=data3L
+ paddd mm3, mm5 ; mm3=data3H
+ psubd mm1, mm0 ; mm1=data4L
+ psubd mm2, mm5 ; mm2=data4H
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
+
+ paddd mm4, mm0
+ paddd mm3, mm0
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm1, mm0
+ paddd mm2, mm0
+ psrad mm1, DESCALE_P2
+ psrad mm2, DESCALE_P2
+ ; Load the sample-centering constant now; it is added to all four packed byte registers below.
+ movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
+
+ packssdw mm4, mm3 ; mm4=data3=(03 13 23 33)
+ packssdw mm1, mm2 ; mm1=data4=(04 14 24 34)
+
+ movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
+ movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
+
+ packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm5
+ paddb mm3, mm5
+ paddb mm6, mm5
+ paddb mm4, mm5
+ ; Three-phase byte/word/dword interleave turns the column-paired registers into four complete 8-sample rows.
+ movq mm2, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm1, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33)
+ punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35)
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33)
+ movq mm5, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17)
+ punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37)
+
+ movq mm3, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17)
+ movq mm4, mm7 ; transpose coefficients(phase 3)
+ punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37)
+ ; ebx is borrowed below as a second row pointer, so the GOT address is saved around the stores.
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; row pointers come from output_buf (edi); eax = output_col
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW ; advance to the next four output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state (must precede any later x87 FPU use)
+ ; Epilogue: restore the registers pushed in the prologue, in reverse order, then unwind the aligned frame.
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp (the frame address stored at [aligned ebp] by the prologue)
+ pop ebp ; restore the caller's ebp pushed at entry
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits carried by the F_* multipliers
+%define PASS1_BITS 2 ; extra scaling left in the pass 1 results
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right shift applied at the end of pass 1 (= 11)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; right shift applied at the end of pass 2 (= 18)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is written as an integer pre-scaled by 2^30 and shifted down to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift of x by n bits with round-to-nearest
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; pmaddwd pair for tmp3 (even part)
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; pmaddwd pair for tmp2 (even part)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; pmaddwd pair for z3 (odd part)
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; pmaddwd pair for z4 (odd part)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; pmaddwd pair for tmp0 (odd part)
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; pmaddwd pair for tmp3 (odd part)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; pmaddwd pair for tmp1 (odd part)
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; pmaddwd pair for tmp2 (odd part)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the pass 1 psrad by DESCALE_P1
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before the pass 2 psrad by DESCALE_P2
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added to every packed output byte in pass 2
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0 ; slot at the aligned ebp that holds the argument base pointer saved by the prologue
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; address of scratch slot i, located below the aligned ebp
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 12 ; number of xmmword scratch slots
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push ebp ; save caller's ebp
+ mov eax, esp ; eax = address of the saved (original) ebp
+ sub esp, byte 4 ; make room for one dword below it
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax ; keep that address; read back later as [original_ebp]
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the wk[] scratch slots below ebp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp] ; (disabled: the code below uses eax as set in the prologue)
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap scalar probe before the full vector test
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT ; a non-zero term was found -> do the full IDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR coefficient rows 1..7 together
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0 ; xmm1 = OR of rows 1..7
+ packsswb xmm1, xmm1 ; saturating pack: a non-zero lane stays non-zero
+ packsswb xmm1, xmm1 ; all eight columns now fit in the low dword
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT ; some AC term is non-zero -> do the full IDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; coefficient row 0
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; multiply by the matching dct_table row
+
+ psllw xmm5, PASS1_BITS ; scale up by PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end ; even columns stay in xmm7/xmm1/xmm0/xmm2, as pass 2 expects
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; coefficient row 0
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; coefficient row 2
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm0=in0 (dequantized)
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm1=in2 (dequantized)
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; coefficient row 4
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; coefficient row 6
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm2=in4 (dequantized)
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm3=in6 (dequantized)
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; coefficient row 1
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] ; coefficient row 3
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm4=in1 (dequantized)
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm6=in3 (dequantized)
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] ; coefficient row 5
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] ; coefficient row 7
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm1=in5 (dequantized)
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm3=in7 (dequantized)
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; eax = argument base pointer saved by the prologue
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5 ; add the PB_CENTERJSAMP bytes to every packed sample
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx=output_buf[0]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; esi=output_buf[2]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 ; store row 0 (low 8 bytes of xmm7)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 ; store row 2
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; edx=output_buf[4]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; esi=output_buf[6]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ; store row 4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ; store row 6
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; edx=output_buf[1]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; esi=output_buf[3]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; store row 1 (moved to the low half by pshufd)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ; store row 3
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; edx=output_buf[5]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; esi=output_buf[7]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 ; store row 5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 ; store row 7
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred-mmx.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the FIX() constants below (FIX(x) = x * 2^CONST_BITS)
+%define PASS1_BITS 2 ; scale bits left in pass-1 results (see the psllw in the all-zero-AC path)
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) ; right shift ending pass 1 of the 4x4 IDCT
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) ; right shift ending pass 2 of the 4x4 IDCT
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) ; right shift ending pass 1 of the 2x2 IDCT
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) ; right shift ending pass 2 of the 2x2 IDCT
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift by n with round-to-nearest
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+; Each PW_ entry is one 64-bit row of (a, b) word pairs used as a pmaddwd operand: result dword = x*a + y*b.
+PW_F184_MF076 times 2 dw F_1_847, -F_0_765 ; 4x4 even part, applied to (in2, in6)
+PW_F256_F089 times 2 dw F_2_562, F_0_899 ; 4x4 odd part, (in1, in3) share of tmp2
+PW_F106_MF217 times 2 dw F_1_061, -F_2_172 ; 4x4 odd part, (in1, in3) share of tmp0
+PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509 ; 4x4 odd part, (in5, in7) share of tmp2
+PW_F145_MF021 times 2 dw F_1_451, -F_0_211 ; 4x4 odd part, (in5, in7) share of tmp0
+PW_F362_MF127 times 2 dw F_3_624, -F_1_272 ; 2x2 odd part, applied to (in1, in3)
+PW_F085_MF072 times 2 dw F_0_850, -F_0_720 ; 2x2 odd part, applied to (in5, in7)
+PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1) ; rounding bias added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1) ; rounding bias added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1) ; rounding bias added before psrad DESCALE_P1_2
+PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) ; rounding bias added before psrad DESCALE_P2_2
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added (paddb) to the packed signed bytes before they are stored
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+; jsimd_idct_4x4_mmx: reduced-size inverse DCT, 8x8 coefficients -> 4x4 samples (MMX).
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Coefficient row 4 is never read. Clobbers eax, ecx, edx and mm0-mm7.
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Arguments are read from the stack; emms is executed before returning.
+
+%define dct_table(b) (b) + 8 ; void *dct_table (1st argument)
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block (2nd argument)
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf (3rd argument)
+%define output_col(b) (b) + 20 ; JDIMENSION output_col (4th argument)
+
+%define original_ebp ebp + 0 ; slot holding the pre-alignment esp (base for the argument macros)
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]: scratch slots directly below the aligned ebp
+%define WK_NUM 2 ; wk(0)/wk(1) park tmp0L/tmp0H between the odd part and the output stage
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]: pass-1 results, placed below wk[]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+ push ebp
+ mov eax, esp ; eax = esp after the push; the argument macros are relative to it
+ sub esp, byte 4 ; room for the slot that remembers eax
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax ; [original_ebp] = pre-alignment esp
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace] ; reserve wk[] and workspace[] below ebp
+ pushpic ebx ; macro defined outside this view; presumably saves ebx only in PIC builds
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; disabled: eax still holds this value from the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr: each iteration handles 4 columns
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap pre-test on the first dword of rows 1 and 2
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together rows 1,2,3,5,6,7 (row 4 is not used)
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm0, mm1
+ packsswb mm0, mm0 ; squeeze the 4 words into the low 4 bytes (nonzero stays nonzero)
+ movd eax, mm0
+ test eax, eax
+ jnz short .columnDCT ; at least one AC term is nonzero -> full column IDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; DC row of the 4 columns
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+
+ psllw mm0, PASS1_BITS ; apply the pass-1 output scaling directly
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; each work-array row = one column's scaled DC, replicated
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm0=in1 (dequantized)
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm1=in3 (dequantized)
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm2=in5 (dequantized)
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm3=in7 (dequantized)
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1 ; mm4=(in1,in3) word pairs, low two columns
+ punpckhwd mm5, mm1 ; mm5=(in1,in3) word pairs, high two columns
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3 ; mm6=(in5,in7) word pairs, low two columns
+ punpckhwd mm7, mm3 ; mm7=(in5,in7) word pairs, high two columns
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm4=in0 (dequantized)
+ pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm5=in2 (dequantized)
+ pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; mm0=in6 (dequantized)
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1 ; keep copies of tmp0L/tmp0H for the subtraction
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
+
+ paddd mm1, mm6 ; add rounding bias, then shift right: descale data0/data3
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P1_4
+ psrad mm2, DESCALE_P1_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 01 02 03)
+ packssdw mm5, mm3 ; mm5=data3=(30 31 32 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
+
+ paddd mm4, mm7 ; add rounding bias, then shift right: descale data1/data2
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P1_4
+ psrad mm0, DESCALE_P1_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm4, mm0 ; mm4=data1=(10 11 12 13)
+ packssdw mm2, mm3 ; mm2=data2=(20 21 22 23)
+
+ movq mm6, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm4 ; mm1=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm7, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm5 ; mm2=(20 30 21 31)
+ punpckhwd mm7, mm5 ; mm7=(22 32 23 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm2 ; mm1=(00 10 20 30)
+ punpckhdq mm0, mm2 ; mm0=(01 11 21 31)
+ movq mm3, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(02 12 22 32)
+ punpckhdq mm3, mm7 ; mm3=(03 13 23 33)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 ; store the transposed results: one work-array row per input column
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; eax = pre-alignment esp (argument base)
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; work-array rows 1,3,5,7 (no dequantization in pass 2)
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1 ; interleave rows 1 and 3 into word pairs for pmaddwd
+ punpckhwd mm5, mm1
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3 ; interleave rows 5 and 7 into word pairs for pmaddwd
+ punpckhwd mm7, mm3
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; work-array rows 0,2,6 (row 4 is not used)
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1 ; keep copies of tmp0L/tmp0H for the subtraction
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
+
+ paddd mm1, mm6 ; add rounding bias, then shift right: descale data0/data3
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P2_4
+ psrad mm2, DESCALE_P2_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 10 20 30)
+ packssdw mm5, mm3 ; mm5=data3=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
+
+ paddd mm4, mm7 ; add rounding bias, then shift right: descale data1/data2
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P2_4
+ psrad mm0, DESCALE_P2_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm4, mm0 ; mm4=data1=(01 11 21 31)
+ packssdw mm2, mm3 ; mm2=data2=(02 12 22 32)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32)
+ packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33)
+ paddb mm1, mm6 ; add CENTERJSAMPLE to every saturated signed byte
+ paddb mm4, mm6
+
+ movq mm7, mm1 ; transpose coefficients(phase 1)
+ punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13)
+ punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = output_buf[0]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; esi = output_buf[2]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 ; row 0 <- (00 01 02 03)
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 ; row 2 <- (20 21 22 23)
+
+ psrlq mm1, 4*BYTE_BIT ; mm1=(10 11 12 13 -- -- -- --)
+ psrlq mm0, 4*BYTE_BIT ; mm0=(30 31 32 33 -- -- -- --)
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; edx = output_buf[1]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; esi = output_buf[3]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 ; row 1 <- (10 11 12 13)
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 ; row 3 <- (30 31 32 33)
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx ; counterpart of the pushpic in the prologue
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only rows 0,1,3,5,7 and columns 0,1,3,5,7 of the input contribute (see the diagrams below).
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Clobbers eax, ecx, edx and mm0-mm7; emms is executed before returning.
+
+%define dct_table(b) (b) + 8 ; void *dct_table (1st argument)
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block (2nd argument)
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf (3rd argument)
+%define output_col(b) (b) + 20 ; JDIMENSION output_col (4th argument)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+ push ebp
+ mov ebp, esp ; plain frame: no aligned stack scratch is needed here
+ push ebx ; always saved: ebx doubles as scratch in the output stage
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 1, columns 0-3
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 3, columns 0-3
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 5, columns 0-3
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 7, columns 0-3
+
+ ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+ ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+ pcmpeqd mm7, mm7 ; mm7 = all ones
+ pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+ movq mm4, mm0 ; mm4=(10 11 ** 13)
+ movq mm5, mm2 ; mm5=(50 51 ** 53)
+ punpcklwd mm4, mm1 ; mm4=(10 30 11 31)
+ punpcklwd mm5, mm3 ; mm5=(50 70 51 71)
+ pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)] ; in1*F_3_624 - in3*F_1_272 (only the col0 dword is used later)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] ; in5*F_0_850 - in7*F_0_720 (only the col0 dword is used later)
+
+ psrld mm0, WORD_BIT ; mm0=(11 -- 13 --)
+ pand mm1, mm7 ; mm1=(-- 31 -- 33)
+ psrld mm2, WORD_BIT ; mm2=(51 -- 53 --)
+ pand mm3, mm7 ; mm3=(-- 71 -- 73)
+ por mm0, mm1 ; mm0=(11 31 13 33)
+ por mm2, mm3 ; mm2=(51 71 53 73)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)] ; same products for columns 1 and 3
+ pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm4, mm5 ; mm4=tmp0[col0 col1]
+
+ movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+ pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 1, columns 4-7
+ pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 3, columns 4-7
+ movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+ pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 5, columns 4-7
+ pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 7, columns 4-7
+
+ ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+ ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+ psrld mm6, WORD_BIT ; mm6=(15 -- 17 --)
+ pand mm1, mm7 ; mm1=(-- 35 -- 37)
+ psrld mm3, WORD_BIT ; mm3=(55 -- 57 --)
+ pand mm5, mm7 ; mm5=(-- 75 -- 77)
+ por mm6, mm1 ; mm6=(15 35 17 37)
+ por mm3, mm5 ; mm3=(55 75 57 77)
+ pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)] ; same products for columns 5 and 7
+ pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm0, mm2 ; mm0=tmp0[col1 col3]
+ paddd mm6, mm3 ; mm6=tmp0[col5 col7]
+
+ ; -- Even part
+
+ movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0, columns 0-3
+ pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0, columns 4-7
+
+ ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+ movq mm2, mm1 ; mm2=(00 01 ** 03)
+ pslld mm1, WORD_BIT ; mm1=(-- 00 -- **)
+ psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
+
+ pand mm2, mm7 ; mm2=(-- 01 -- 03)
+ pand mm5, mm7 ; mm5=(-- 05 -- 07)
+ psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
+ psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
+
+ ; -- Final output stage
+
+ movq mm3, mm1
+ paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **)
+ psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **)
+ punpckldq mm1, mm3 ; mm1=(A0 B0)
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
+
+ movq mm4, mm2
+ movq mm3, mm5
+ paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3)
+ paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7)
+ psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3)
+ psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7)
+
+ paddd mm1, mm7 ; add rounding bias, then shift right: descale (A0 B0)
+ psrad mm1, DESCALE_P1_2
+
+ paddd mm2, mm7 ; descale the remaining A/B values the same way
+ paddd mm5, mm7
+ psrad mm2, DESCALE_P1_2
+ psrad mm5, DESCALE_P1_2
+ paddd mm4, mm7
+ paddd mm3, mm7
+ psrad mm4, DESCALE_P1_2
+ psrad mm3, DESCALE_P1_2
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)] ; eax = output_col
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3)
+ packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)] ; (A1*F_3_624 - A3*F_1_272, same for B)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] ; (A5*F_0_850 - A7*F_0_720, same for B)
+
+ paddd mm2, mm5 ; mm2=tmp0[row0 row1]
+
+ ; -- Even part
+
+ pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1]
+
+ ; -- Final output stage
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
+
+ movq mm6, mm1
+ paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1)
+ psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1)
+
+ paddd mm1, mm0 ; add rounding bias, then shift right: descale
+ paddd mm6, mm0
+ psrad mm1, DESCALE_P2_2
+ psrad mm6, DESCALE_P2_2
+
+ movq mm7, mm1 ; transpose coefficients
+ punpckldq mm1, mm6 ; mm1=(C0 D0)
+ punpckhdq mm7, mm6 ; mm7=(C1 D1)
+
+ packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1)
+ packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+ paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)] ; add CENTERJSAMPLE to each byte (last use of ebx as the GOT base)
+
+ movd ecx, mm1 ; ecx=(C0 D0 C1 D1)
+ movd ebx, mm1 ; ebx=(C0 D0 C1 D1)
+ shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = output_buf[0]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; esi = output_buf[1]
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx ; output row 0 <- (C0 D0)
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx ; output row 1 <- (C1 D1)
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred-sse2.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the FIX() constants below (FIX(x) = x * 2^CONST_BITS)
+%define PASS1_BITS 2 ; scale bits left in pass-1 results (see the psllw in the all-zero-AC path)
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) ; right shift ending pass 1 of the 4x4 IDCT
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) ; right shift ending pass 2 of the 4x4 IDCT
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) ; presumably the pass-1 shift of the 2x2 IDCT (not visible here)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) ; presumably the pass-2 shift of the 2x2 IDCT (not visible here)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift by n with round-to-nearest
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; Each PW_ entry is one 128-bit row of (a, b) word pairs used as a pmaddwd operand: result dword = x*a + y*b.
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765 ; 4x4 even part, applied to (in2, in6)
+PW_F256_F089 times 4 dw F_2_562, F_0_899 ; 4x4 odd part, (in1, in3) share of tmp2
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172 ; 4x4 odd part, (in1, in3) share of tmp0
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 ; 4x4 odd part, (in5, in7) share of tmp2
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211 ; 4x4 odd part, (in5, in7) share of tmp0
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272 ; pairs (3.624, -1.272)
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720 ; pairs (0.850, -0.720)
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) ; rounding bias added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) ; rounding bias added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) ; half of 1 << DESCALE_P1_2, i.e. a rounding bias for that shift
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) ; half of 1 << DESCALE_P2_2, i.e. a rounding bias for that shift
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; CENTERJSAMPLE replicated into all 16 bytes
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx ; saved: holds the GOT address, later reused as scratch by pextrw
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; load row 1
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] ; load row 3
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 1
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 3
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] ; load row 5
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] ; load row 7
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 5
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 7
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7 ; xmm7=(all 1's)
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)] ; xmm4=rows 1,3 part of tmp0
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)] ; xmm5=rows 5,7 part of tmp0
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)] ; xmm0=rows 1,3 part of tmp0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)] ; xmm2=rows 5,7 part of tmp0
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; load row 0
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6 ; keep a copy of tmp10 for the subtraction
+ movdqa xmm5, xmm1 ; keep a copy of tmp10 for the subtraction
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)] ; xmm1=cols 1,3 part of tmp0
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)] ; xmm7=cols 5,7 part of tmp0
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; last GOT-relative access; ebx is free after this
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx=output row 0
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; esi=output row 1
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx ; output row 0 <- (C0 D0)
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx ; output row 1 <- (C1 D1)
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two sample rows per iteration
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ pi2fd mm4, mm4 ; packed int32 -> float
+ pi2fd mm2, mm2 ; packed int32 -> float
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ pi2fd mm5, mm5 ; packed int32 -> float
+ pi2fd mm0, mm0 ; packed int32 -> float
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ pi2fd mm6, mm6 ; packed int32 -> float
+ pi2fd mm3, mm3 ; packed int32 -> float
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ pi2fd mm4, mm4 ; packed int32 -> float
+ pi2fd mm1, mm1 ; packed int32 -> float
+
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; advance two input rows
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance two workspace rows
+ dec ecx
+ jnz near .convloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
+ movd mm7, eax
+ punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F}
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16 ; loop count: 16 coefficients per iteration
+ alignx 16, 7
+.quantloop:
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; workspace * divisors
+ pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm0, mm7 ; mm0=(00 ** 01 **)
+ pfadd mm1, mm7 ; mm1=(02 ** 03 **)
+ pfadd mm2, mm7 ; mm2=(04 ** 05 **)
+ pfadd mm3, mm7 ; mm3=(06 ** 07 **)
+
+ movq mm4, mm0
+ punpcklwd mm0, mm1 ; mm0=(00 02 ** **)
+ punpckhwd mm4, mm1 ; mm4=(01 03 ** **)
+ movq mm5, mm2
+ punpcklwd mm2, mm3 ; mm2=(04 06 ** **)
+ punpckhwd mm5, mm3 ; mm5=(05 07 ** **)
+
+ punpcklwd mm0, mm4 ; mm0=(00 01 02 03)
+ punpcklwd mm2, mm5 ; mm2=(04 05 06 07)
+
+ movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] ; workspace * divisors
+ pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm6, mm7 ; mm6=(10 ** 11 **)
+ pfadd mm1, mm7 ; mm1=(12 ** 13 **)
+ pfadd mm3, mm7 ; mm3=(14 ** 15 **)
+ pfadd mm4, mm7 ; mm4=(16 ** 17 **)
+
+ movq mm5, mm6
+ punpcklwd mm6, mm1 ; mm6=(10 12 ** **)
+ punpckhwd mm5, mm1 ; mm5=(11 13 ** **)
+ movq mm1, mm3
+ punpcklwd mm3, mm4 ; mm3=(14 16 ** **)
+ punpckhwd mm1, mm4 ; mm1=(15 17 ** **)
+
+ punpcklwd mm6, mm5 ; mm6=(10 11 12 13)
+ punpcklwd mm3, mm1 ; mm3=(14 15 16 17)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT ; next 16 workspace values
+ add edx, byte 16*SIZEOF_FAST_FLOAT ; next 16 divisors
+ add edi, byte 16*SIZEOF_JCOEF ; next 16 output coefficients
+ dec eax
+ jnz near .quantloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor mm6, mm6 ; mm6=(all 0's)
+ pcmpeqw mm7, mm7
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; loop count: four sample rows per iteration
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
+ movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
+
+ movq mm4, mm0
+ punpcklbw mm0, mm6 ; mm0=(0123)
+ punpckhbw mm4, mm6 ; mm4=(4567)
+ movq mm5, mm1
+ punpcklbw mm1, mm6 ; mm1=(89AB)
+ punpckhbw mm5, mm6 ; mm5=(CDEF)
+
+ paddw mm0, mm7 ; add 0xFF80 (-128 as a signed word) to each sample
+ paddw mm4, mm7
+ paddw mm1, mm7
+ paddw mm5, mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+ movq mm0, mm2
+ punpcklbw mm2, mm6 ; mm2=(GHIJ)
+ punpckhbw mm0, mm6 ; mm0=(KLMN)
+ movq mm4, mm3
+ punpcklbw mm3, mm6 ; mm3=(OPQR)
+ punpckhbw mm4, mm6 ; mm4=(STUV)
+
+ paddw mm2, mm7 ; add 0xFF80 (-128 as a signed word) to each sample
+ paddw mm0, mm7
+ paddw mm3, mm7
+ paddw mm4, mm7
+
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+ add esi, byte 4*SIZEOF_JSAMPROW ; advance four input rows
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance four workspace rows
+ dec ecx
+ jnz short .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+ MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+ MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov ah, 2 ; outer loop count
+ alignx 16, 7
+.quantloop1:
+ mov al, DCTSIZE2/8/2 ; inner loop count: 8 coefficients per iteration
+ alignx 16, 7
+.quantloop2:
+ movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+ movq mm0, mm2
+ movq mm1, mm3
+
+ psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise
+ psraw mm3, (WORD_BIT-1)
+
+ pxor mm0, mm2 ; val = abs(val), computed as (val ^ sign) - sign
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ ;
+ ; MMX is an annoyingly crappy instruction set. It has two
+ ; misfeatures that are causing problems here:
+ ;
+ ; - All multiplications are signed.
+ ;
+ ; - The second operand for the shifts is not treated as packed.
+ ;
+ ;
+ ; We work around the first problem by implementing this algorithm:
+ ;
+ ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+ ; {
+ ; enum { SHORT_BIT = 16 };
+ ; signed short sx = (signed short)x;
+ ; signed short sy = (signed short)y;
+ ; signed long sz;
+ ;
+ ; sz = (long)sx * (long)sy; /* signed multiply */
+ ;
+ ; if (sx < 0) sz += (long)sy << SHORT_BIT;
+ ; if (sy < 0) sz += (long)sx << SHORT_BIT;
+ ;
+ ; return (unsigned long)sz;
+ ; }
+ ;
+ ; (note that a negative sx adds _sy_ and vice versa)
+ ;
+ ; For the second problem, we replace the shift by a multiplication.
+ ; Unfortunately that means we have to deal with the signed issue again.
+ ;
+
+ paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw mm1, MMWORD [CORRECTION(0,1,edx)]
+
+ movq mm4, mm0 ; store current value for later
+ movq mm5, mm1
+ pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
+ paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
+ paddw mm1, mm5 ; so we always need to add the initial value
+ ; (input value is never negative as we
+ ; inverted it at the start of this routine)
+
+ ; here it gets a bit tricky as both scale
+ ; and mm0/mm1 can be negative
+ movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
+ movq mm7, MMWORD [SCALE(0,1,edx)]
+ movq mm4, mm0
+ movq mm5, mm1
+ pmulhw mm0, mm6
+ pmulhw mm1, mm7
+
+ psraw mm6, (WORD_BIT-1) ; determine if scale is negative
+ psraw mm7, (WORD_BIT-1)
+
+ pand mm6, mm4 ; and add input if it is
+ pand mm7, mm5
+ paddw mm0, mm6
+ paddw mm1, mm7
+
+ psraw mm4, (WORD_BIT-1) ; then check if negative input
+ psraw mm5, (WORD_BIT-1)
+
+ pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
+ pand mm5, MMWORD [SCALE(0,1,edx)]
+ paddw mm0, mm4
+ paddw mm1, mm5
+
+ pxor mm0, mm2 ; reapply the original sign: (val ^ sign) - sign
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; NOTE(review): edi advances by SIZEOF_JCOEF; presumably same size as DCTELEM - confirm
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+ add esi, byte 8*SIZEOF_DCTELEM ; next 8 workspace values
+ add edx, byte 8*SIZEOF_DCTELEM ; next 8 entries of each divisor table
+ add edi, byte 8*SIZEOF_JCOEF ; next 8 output coefficients
+ dec al
+ jnz near .quantloop2
+ dec ah
+ jnz near .quantloop1 ; to avoid branch misprediction
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two sample rows per iteration
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ cvtpi2ps xmm0, mm4 ; xmm0=(01**)
+ cvtpi2ps xmm1, mm2 ; xmm1=(23**)
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ cvtpi2ps xmm2, mm5 ; xmm2=(45**)
+ cvtpi2ps xmm3, mm0 ; xmm3=(67**)
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ cvtpi2ps xmm4, mm6 ; xmm4=(89**)
+ cvtpi2ps xmm5, mm3 ; xmm5=(AB**)
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ cvtpi2ps xmm6, mm4 ; xmm6=(CD**)
+ cvtpi2ps xmm7, mm1 ; xmm7=(EF**)
+
+ movlhps xmm0, xmm1 ; xmm0=(0123)
+ movlhps xmm2, xmm3 ; xmm2=(4567)
+ movlhps xmm4, xmm5 ; xmm4=(89AB)
+ movlhps xmm6, xmm7 ; xmm6=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; advance two input rows
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance two workspace rows
+ dec ecx
+ jnz near .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16 ; loop count: 16 coefficients per iteration
+ alignx 16, 7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; workspace * divisors
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ movhlps xmm4, xmm0 ; low half of xmm4 = high two floats of xmm0
+ movhlps xmm5, xmm1 ; low half of xmm5 = high two floats of xmm1
+
+ cvtps2pi mm0, xmm0 ; low two floats -> two int32 (MXCSR rounding mode)
+ cvtps2pi mm1, xmm1
+ cvtps2pi mm4, xmm4
+ cvtps2pi mm5, xmm5
+
+ movhlps xmm6, xmm2 ; low half of xmm6 = high two floats of xmm2
+ movhlps xmm7, xmm3 ; low half of xmm7 = high two floats of xmm3
+
+ cvtps2pi mm2, xmm2 ; low two floats -> two int32 (MXCSR rounding mode)
+ cvtps2pi mm3, xmm3
+ cvtps2pi mm6, xmm6
+ cvtps2pi mm7, xmm7
+
+ packssdw mm0, mm4 ; mm0=four words from block (0,0), signed-saturated
+ packssdw mm1, mm5 ; mm1=four words from block (0,1)
+ packssdw mm2, mm6 ; mm2=four words from block (1,0)
+ packssdw mm3, mm7 ; mm3=four words from block (1,1)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT ; next 16 workspace values
+ add edx, byte 16*SIZEOF_FAST_FLOAT ; next 16 divisors
+ add edi, byte 16*SIZEOF_JCOEF ; next 16 output coefficients
+ dec eax
+ jnz short .quantloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+; Reads 8 samples from each row of sample_data starting at column
+; start_col, subtracts the centre value 0x80 and writes the results to
+; workspace as single-precision floats.
+;
+; Register roles: esi = current position in the row-pointer array,
+; eax = start_col, edi = workspace write pointer, ecx = remaining
+; iterations (DCTSIZE/2, two rows per pass), ebx/edx = the two row pointers
+; of the current pass, xmm7 = 0x80 in every byte.
+EXTN(jsimd_convsamp_float_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+; Build the 0x80 byte constant in-register: all-ones words shifted left by 7
+; give 0xFF80, which packsswb narrows to the byte 0x80.
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+; Subtracting 0x80 modulo 256 turns each unsigned sample into a signed byte.
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+; Widen byte -> dword by placing each sample in the top byte of its dword.
+; The lower bytes (marked '*') are don't-care: the arithmetic right shift
+; below discards them, which is why xmm2/xmm3 need no initialisation.
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
+
+; Arithmetic shift sign-extends the top byte to a full dword, then convert
+; the signed integers to floats.
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+; movaps faults on misaligned addresses: workspace must be 16-byte aligned.
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+; Step to the next pair of rows in both the input and the output.
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+; Multiplies each workspace float by the matching divisors entry, converts
+; the products to 32-bit integers and stores them as saturated 16-bit
+; coefficients in coef_block.
+; (presumably the divisors table holds reciprocals -- confirm against the
+; code that builds it)
+;
+; Register roles: esi = workspace, edx = divisors, edi = coef_block,
+; eax = remaining iterations (DCTSIZE2/16, i.e. 16 values per pass).
+EXTN(jsimd_quantize_float_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+; Load 16 floats and scale them; movaps requires 16-byte aligned operands.
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+; float -> int32, rounded according to the current MXCSR rounding mode.
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+; int32 -> int16 with signed saturation; each result holds 8 coefficients.
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+; movdqa requires coef_block to be 16-byte aligned.
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+; Advance all three pointers by the 16 elements just processed.
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+; Reads 8 samples from each of 8 rows of sample_data starting at column
+; start_col, zero-extends them to 16 bits, adds 0xFF80 (i.e. subtracts 128)
+; and stores the 64 results to workspace. Straight-line code, no loop.
+;
+; Register roles: esi = row-pointer array, eax = start_col,
+; edi = workspace, ebx/edx = scratch row pointers.
+EXTN(jsimd_convsamp_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+
+; Load 8 bytes from each of rows 0..7 into the low halves of xmm0..xmm7.
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+; Pair the rows up: the even row stays in the low 128-bit lane and the odd
+; row is inserted as the high lane, overwriting whatever the upper half of
+; the ymm register held before.
+ vinserti128 ymm0, ymm0, xmm1, 1
+ vinserti128 ymm2, ymm2, xmm3, 1
+ vinserti128 ymm4, ymm4, xmm5, 1
+ vinserti128 ymm6, ymm6, xmm7, 1
+
+; Interleaving with zero widens the low 8 bytes of each lane to 8 words.
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpunpcklbw ymm0, ymm0, ymm1
+ vpunpcklbw ymm2, ymm2, ymm1
+ vpunpcklbw ymm4, ymm4, ymm1
+ vpunpcklbw ymm6, ymm6, ymm1
+
+; 0xFF80 is -128 as a signed word, so the additions below centre the
+; samples around zero.
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm4, ymm4, ymm7
+ vpaddw ymm6, ymm6, ymm7
+
+; Unaligned stores: workspace is not required to be 32-byte aligned here.
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+; Clear the upper ymm halves before returning to non-VEX code.
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+; The divisors argument is addressed as three consecutive tables, each
+; DCTSIZE rows long: reciprocals, corrections and scales.
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+; Quantizes all 64 coefficients of workspace in one straight-line pass:
+; result = sign(x) * (((|x| + correction) * reciprocal >> 16) * scale >> 16)
+; using unsigned 16-bit high multiplies, and stores the results to
+; coef_block.
+;
+; Register roles: esi = workspace, edx = divisors, edi = coef_block,
+; ymm4..ymm7 = original coefficients (kept for their signs),
+; ymm0..ymm3 = working magnitudes.
+EXTN(jsimd_quantize_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+
+; Load the whole block (4 x 16 words) and take absolute values.
+ vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+; vpmulhuw keeps the high 16 bits of each unsigned 16x16 product.
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
+
+; Re-apply the sign of the original coefficient: vpsignw negates a lane
+; whose control word is negative and zeroes a lane whose control word is 0.
+ vpsignw ymm0, ymm0, ymm4
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+; Clear the upper ymm halves before returning to non-VEX code.
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+; Reads 8 samples from each row of sample_data starting at column
+; start_col, zero-extends them to 16 bits, adds 0xFF80 (i.e. subtracts 128)
+; and stores the results to workspace.
+;
+; Register roles: esi = current position in the row-pointer array,
+; eax = start_col, edi = workspace write pointer, ecx = remaining
+; iterations (DCTSIZE/4, four rows per pass), ebx/edx = scratch row
+; pointers, xmm6 = zero, xmm7 = 0xFF80 in every word.
+EXTN(jsimd_convsamp_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+; Interleave with zero to widen bytes to words, then add -128 (0xFF80) to
+; centre the samples around zero.
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+; movdqa faults on misaligned addresses: workspace must be 16-byte aligned.
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+; Step to the next group of four rows in both the input and the output.
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+; The divisors argument is addressed as three consecutive tables, each
+; DCTSIZE rows long: reciprocals, corrections and scales.
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+; Quantizes the coefficients in workspace, 32 per loop pass:
+; result = sign(x) * (((|x| + correction) * reciprocal >> 16) * scale >> 16)
+; using unsigned 16-bit high multiplies, and stores the results to
+; coef_block.
+;
+; Register roles: esi = workspace, edx = divisors, edi = coef_block,
+; eax = remaining iterations (DCTSIZE2/32), xmm4..xmm7 = per-word sign
+; masks, xmm0..xmm3 = working magnitudes.
+EXTN(jsimd_quantize_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/32
+ alignx 16, 7
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+; Arithmetic shift by 15 turns each word into a sign mask (0 or 0xFFFF);
+; (x XOR mask) - mask is then the two's-complement absolute value.
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+; pmulhuw keeps the high 16 bits of each unsigned 16x16 product.
+ paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+; Same XOR/subtract trick with the saved masks restores the original sign.
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+; Advancing edx moves the window in all three divisor tables at once,
+; because RECIPROCAL/CORRECTION/SCALE are fixed offsets from edx.
+ add esi, byte 32*SIZEOF_DCTELEM
+ add edx, byte 32*SIZEOF_DCTELEM
+ add edi, byte 32*SIZEOF_JCOEF
+ dec eax
+; NOTE(review): 'near' rather than 'short' -- presumably the loop body is
+; too long for an 8-bit branch displacement; confirm before changing.
+ jnz near .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c
new file mode 100644
index 0000000000..b429b0a532
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimd.c
@@ -0,0 +1,1312 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+/*
+ * IS_ALIGNED(ptr, order) is nonzero when ptr is a multiple of 2^order bytes.
+ * Both arguments are fully parenthesised so that compound expressions
+ * (e.g. "base + offset" or "cond ? 4 : 5") are evaluated as a unit, and the
+ * pointer is converted through size_t so no address bits are dropped on
+ * targets where pointers are wider than unsigned int.
+ */
+#define IS_ALIGNED(ptr, order) \
+  ((((size_t)(ptr)) & (((size_t)1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+/*
+ * simd_support: cached mask of JSIMD_* capability bits.  The initial value
+ * ~0 is a "not yet probed" sentinel that init_simd() replaces.
+ * simd_huffman: cleared by init_simd() when JSIMD_NOHUFFENC=1 is set in the
+ * environment.
+ */
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+/*
+ * Probe the CPU and cache the resulting JSIMD_* capability mask in
+ * simd_support, then apply any environment-variable overrides.  Does nothing
+ * if simd_support no longer holds the ~0 "unprobed" sentinel.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char value[2] = { 0 };
+
+/* True when variable `name` was read successfully and is exactly "1". */
+#define ENV_IS_ONE(name) \
+  (GETENV_S(value, 2, name) == 0 && strcmp(value, "1") == 0)
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Overrides are applied in this fixed order; each one only masks bits. */
+  if (ENV_IS_ONE("JSIMD_FORCEMMX"))
+    simd_support &= JSIMD_MMX;
+  if (ENV_IS_ONE("JSIMD_FORCE3DNOW"))
+    simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+  if (ENV_IS_ONE("JSIMD_FORCESSE"))
+    simd_support &= JSIMD_SSE | JSIMD_MMX;
+  if (ENV_IS_ONE("JSIMD_FORCESSE2"))
+    simd_support &= JSIMD_SSE2;
+  if (ENV_IS_ONE("JSIMD_FORCEAVX2"))
+    simd_support &= JSIMD_AVX2;
+  if (ENV_IS_ONE("JSIMD_FORCENONE"))
+    simd_support = 0;
+  if (ENV_IS_ONE("JSIMD_NOHUFFENC"))
+    simd_huffman = 0;
+
+#undef ENV_IS_ONE
+#endif
+}
+
+/*
+ * Report whether an accelerated RGB -> YCbCr conversion is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  /* The AVX2/SSE2 paths also require their constant tables to be aligned. */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether an accelerated RGB -> grayscale conversion is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  /* The AVX2/SSE2 paths also require their constant tables to be aligned. */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether an accelerated YCbCr -> RGB conversion is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  /* The AVX2/SSE2 paths also require their constant tables to be aligned. */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Always reports 0: this file provides no accelerated YCbCr -> RGB565 path
+ * (jsimd_ycc_rgb565_convert() below is an empty stub).
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ if (simd_support == ~0U)
+ init_simd();
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extrgb_ycc_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+ sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+ mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+ sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+ mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_ycc_convert_avx2;
+ sse2fct = jsimd_rgb_ycc_convert_sse2;
+ mmxfct = jsimd_rgb_ycc_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ if (simd_support == ~0U)
+ init_simd();
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_gray_convert_avx2;
+ sse2fct = jsimd_extrgb_gray_convert_sse2;
+ mmxfct = jsimd_extrgb_gray_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_gray_convert_avx2;
+ sse2fct = jsimd_extrgbx_gray_convert_sse2;
+ mmxfct = jsimd_extrgbx_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_gray_convert_avx2;
+ sse2fct = jsimd_extbgr_gray_convert_sse2;
+ mmxfct = jsimd_extbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_gray_convert_avx2;
+ sse2fct = jsimd_extbgrx_gray_convert_sse2;
+ mmxfct = jsimd_extbgrx_gray_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_gray_convert_avx2;
+ sse2fct = jsimd_extxbgr_gray_convert_sse2;
+ mmxfct = jsimd_extxbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_gray_convert_avx2;
+ sse2fct = jsimd_extxrgb_gray_convert_sse2;
+ mmxfct = jsimd_extxrgb_gray_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_gray_convert_avx2;
+ sse2fct = jsimd_rgb_gray_convert_sse2;
+ mmxfct = jsimd_rgb_gray_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ if (simd_support == ~0U)
+ init_simd();
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_ycc_extrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extrgb_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+ sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+ mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_ycc_extbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extbgr_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+ sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+ mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_ycc_rgb_convert_avx2;
+ sse2fct = jsimd_ycc_rgb_convert_sse2;
+ mmxfct = jsimd_ycc_rgb_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else
+ mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/*
+ * Intentionally empty stub.  jsimd_can_ycc_rgb565() in this file always
+ * returns 0, so presumably callers never dispatch here -- verify against the
+ * caller before relying on that.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+/*
+ * Report whether an accelerated h2v2 downsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the three instruction sets is sufficient. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) != 0;
+}
+
+/*
+ * Report whether an accelerated h2v1 downsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the three instruction sets is sufficient. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) != 0;
+}
+
+/*
+ * Run the h2v2 downsampler for one component, preferring AVX2, then SSE2,
+ * and otherwise falling back to MMX.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+    return;
+  }
+
+  /* Neither AVX2 nor SSE2 is flagged: use the MMX kernel unconditionally. */
+  jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+/*
+ * Run the h2v1 downsampler for one component, preferring AVX2, then SSE2,
+ * and otherwise falling back to MMX.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+    return;
+  }
+
+  /* Neither AVX2 nor SSE2 is flagged: use the MMX kernel unconditionally. */
+  jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+/*
+ * Report whether an accelerated h2v2 upsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the three instruction sets is sufficient. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) != 0;
+}
+
+/*
+ * Report whether an accelerated h2v1 upsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the three instruction sets is sufficient. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) != 0;
+}
+
+/*
+ * Run the h2v2 upsampler, preferring AVX2, then SSE2, and otherwise falling
+ * back to MMX.  compptr is not used by this dispatcher.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+    return;
+  }
+
+  /* Neither AVX2 nor SSE2 is flagged: use the MMX kernel unconditionally. */
+  jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                          input_data, output_data_ptr);
+}
+
+/*
+ * Run the h2v1 upsampler, preferring AVX2, then SSE2, and otherwise falling
+ * back to MMX.  compptr is not used by this dispatcher.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+    return;
+  }
+
+  /* Neither AVX2 nor SSE2 is flagged: use the MMX kernel unconditionally. */
+  jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                          input_data, output_data_ptr);
+}
+
+/*
+ * Report whether an accelerated h2v2 "fancy" upsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* The AVX2/SSE2 paths also require their constant tables to be aligned. */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether an accelerated h2v1 "fancy" upsampler is usable.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* The AVX2/SSE2 paths also require their constant tables to be aligned. */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * h2v2 fancy upsampling dispatcher.
+ *
+ * Selects the AVX2, SSE2 or MMX routine with the same tests that
+ * jsimd_can_h2v2_fancy_upsample() applies: an AVX2/SSE2 routine is chosen
+ * only when its CPU flag is set AND its constant table passes the
+ * IS_ALIGNED_* check.  Without the alignment tests, a configuration that
+ * the predicate approved through its SSE2 or MMX clause (because the AVX2
+ * table failed the alignment check) would still run the AVX2 routine.
+ *
+ * Each routine receives cinfo->max_v_samp_factor,
+ * compptr->downsampled_width, input_data and output_data_ptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+/*
+ * h2v1 fancy upsampling dispatcher.
+ *
+ * Selects the AVX2, SSE2 or MMX routine with the same tests that
+ * jsimd_can_h2v1_fancy_upsample() applies: an AVX2/SSE2 routine is chosen
+ * only when its CPU flag is set AND its constant table passes the
+ * IS_ALIGNED_* check.  Without the alignment tests, a configuration that
+ * the predicate approved through its SSE2 or MMX clause (because the AVX2
+ * table failed the alignment check) would still run the AVX2 routine.
+ *
+ * Each routine receives cinfo->max_v_samp_factor,
+ * compptr->downsampled_width, input_data and output_data_ptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+/*
+ * Report whether a SIMD h2v2 merged upsampler may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* AVX2 and SSE2 count only if their constant table passes IS_ALIGNED_* */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD h2v1 merged upsampler may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* AVX2 and SSE2 count only if their constant table passes IS_ALIGNED_* */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * h2v2 merged upsampling dispatcher.
+ *
+ * First picks, per cinfo->out_color_space, the AVX2/SSE2/MMX routine
+ * triple for that output pixel layout (the X and A variants of a layout
+ * share one routine; any other colour space uses the generic routines).
+ * Then calls one of the three with cinfo->output_width, input_buf,
+ * in_row_group_ctr and output_buf.
+ *
+ * The instruction-set choice uses the same tests as
+ * jsimd_can_h2v2_merged_upsample(): AVX2/SSE2 are used only when the CPU
+ * flag is set AND the matching constant table passes the IS_ALIGNED_*
+ * check.  Without the alignment tests, a configuration approved through
+ * the predicate's SSE2 or MMX clause could still run the AVX2 routine.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_merged_upsample_mmx;
+    break;
+  }
+
+  /* Mirror the alignment tests made by jsimd_can_h2v2_merged_upsample() */
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * h2v1 merged upsampling dispatcher.
+ *
+ * First picks, per cinfo->out_color_space, the AVX2/SSE2/MMX routine
+ * triple for that output pixel layout (the X and A variants of a layout
+ * share one routine; any other colour space uses the generic routines).
+ * Then calls one of the three with cinfo->output_width, input_buf,
+ * in_row_group_ctr and output_buf.
+ *
+ * The instruction-set choice uses the same tests as
+ * jsimd_can_h2v1_merged_upsample(): AVX2/SSE2 are used only when the CPU
+ * flag is set AND the matching constant table passes the IS_ALIGNED_*
+ * check.  Without the alignment tests, a configuration approved through
+ * the predicate's SSE2 or MMX clause could still run the AVX2 routine.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  if (simd_support == ~0U)
+    init_simd();
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_merged_upsample_mmx;
+    break;
+  }
+
+  /* Mirror the alignment tests made by jsimd_can_h2v1_merged_upsample() */
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Report whether a SIMD integer sample-conversion routine may be used.
+ * Returns 1 if the build-time sizes match and AVX2, SSE2 or MMX is
+ * available, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) ? 1 : 0;
+}
+
+/*
+ * Report whether a SIMD floating-point sample-conversion routine may be
+ * used.  Returns 1 if the build-time sizes match and SSE2, SSE or 3DNow!
+ * is available, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW)) ? 1 : 0;
+}
+
+/*
+ * Integer sample-conversion dispatcher: AVX2 if available, else SSE2,
+ * else MMX.  All three receive sample_data, start_col and workspace.
+ */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+    return;
+  }
+
+  jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+/*
+ * Floating-point sample-conversion dispatcher: SSE2 if available, else
+ * SSE, else 3DNow!.  All three receive sample_data, start_col and
+ * workspace.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE) {
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+    return;
+  }
+
+  jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+/*
+ * Report whether a SIMD "islow" forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* AVX2 and SSE2 count only if their constant table passes IS_ALIGNED_* */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD "ifast" forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* SSE2 counts only if its constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD floating-point forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  /* SSE counts only if its constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_fdct_float_sse)) ||
+         (simd_support & JSIMD_3DNOW);
+}
+
+/*
+ * "islow" forward DCT dispatcher, operating in place on data.
+ *
+ * Uses the same tests as jsimd_can_fdct_islow(): the AVX2/SSE2 routine is
+ * chosen only when its CPU flag is set AND its constant table passes the
+ * IS_ALIGNED_* check; otherwise the MMX routine runs.  This matches the
+ * ifast and float dispatchers, which already repeat their predicate's
+ * alignment tests.  Without the tests, a configuration approved through
+ * the predicate's SSE2 or MMX clause could still run the AVX2 routine.
+ */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    jsimd_fdct_islow_avx2(data);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_islow_sse2(data);
+  else
+    jsimd_fdct_islow_mmx(data);
+}
+
+/*
+ * "ifast" forward DCT dispatcher, operating in place on data.
+ *
+ * Runs the SSE2 routine when SSE2 is available and the ifast SSE2 constant
+ * table passes IS_ALIGNED_SSE; otherwise runs the MMX routine.
+ *
+ * The alignment test must look at jconst_fdct_ifast_sse2, the table that
+ * jsimd_can_fdct_ifast() checks.  The previous code tested
+ * jconst_fdct_islow_sse2, the table of a different DCT, so the decision
+ * could disagree with the predicate.
+ */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else
+    jsimd_fdct_ifast_mmx(data);
+}
+
+/*
+ * Floating-point forward DCT dispatcher, operating in place on data.
+ *
+ * Runs the SSE routine when SSE is available and its constant table
+ * passes IS_ALIGNED_SSE; otherwise runs the 3DNow! routine if 3DNow! is
+ * available.  If neither applies, data is left untouched.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) {
+    jsimd_fdct_float_sse(data);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+/*
+ * Report whether a SIMD integer quantization routine may be used.
+ * Returns 1 if the build-time sizes match and AVX2, SSE2 or MMX is
+ * available, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) ? 1 : 0;
+}
+
+/*
+ * Report whether a SIMD floating-point quantization routine may be used.
+ * Returns 1 if the build-time sizes match and SSE2, SSE or 3DNow! is
+ * available, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW)) ? 1 : 0;
+}
+
+/*
+ * Integer quantization dispatcher: AVX2 if available, else SSE2, else MMX.
+ * All three receive coef_block, divisors and workspace.
+ */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_AVX2) {
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+    return;
+  }
+
+  jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+/*
+ * Floating-point quantization dispatcher: SSE2 if available, else SSE,
+ * else 3DNow!.  All three receive coef_block, divisors and workspace.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE) {
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+    return;
+  }
+
+  jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+/*
+ * Report whether a SIMD 2x2 reduced-size inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* SSE2 counts only if its constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD 4x4 reduced-size inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* SSE2 counts only if its constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * 2x2 reduced-size inverse DCT dispatcher.
+ *
+ * Runs the SSE2 routine when SSE2 is available and jconst_idct_red_sse2
+ * passes IS_ALIGNED_SSE; otherwise runs the MMX routine.  Both receive
+ * compptr->dct_table, coef_block, output_buf and output_col.  cinfo is
+ * not used.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) {
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+    return;
+  }
+
+  jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * 4x4 reduced-size inverse DCT dispatcher.
+ *
+ * Runs the SSE2 routine when SSE2 is available and jconst_idct_red_sse2
+ * passes IS_ALIGNED_SSE; otherwise runs the MMX routine.  Both receive
+ * compptr->dct_table, coef_block, output_buf and output_col.  cinfo is
+ * not used.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) {
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+    return;
+  }
+
+  jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * Report whether a SIMD "islow" inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* AVX2 and SSE2 count only if their constant table passes IS_ALIGNED_* */
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_idct_islow_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD "ifast" inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+  /* SSE2 counts only if its constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether a SIMD floating-point inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  /* Build-time sizes the SIMD routines are written for */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4 ||
+      sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  /* SSE2 and SSE count only if their constant table passes IS_ALIGNED_SSE */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse2)) ||
+         ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse)) ||
+         (simd_support & JSIMD_3DNOW);
+}
+
+/*
+ * "islow" inverse DCT dispatcher.
+ *
+ * Uses the same tests as jsimd_can_idct_islow(): the AVX2/SSE2 routine is
+ * chosen only when its CPU flag is set AND its constant table passes the
+ * IS_ALIGNED_* check; otherwise the MMX routine runs.  This matches the
+ * ifast and float dispatchers, which already repeat their predicate's
+ * alignment tests.  Without the tests, a configuration approved through
+ * the predicate's SSE2 or MMX clause could still run the AVX2 routine.
+ *
+ * Each routine receives compptr->dct_table, coef_block, output_buf and
+ * output_col.  cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+/*
+ * "ifast" inverse DCT dispatcher.
+ *
+ * Runs the SSE2 routine when SSE2 is available and jconst_idct_ifast_sse2
+ * passes IS_ALIGNED_SSE; otherwise runs the MMX routine.  Both receive
+ * compptr->dct_table, coef_block, output_buf and output_col.  cinfo is
+ * not used.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) {
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+    return;
+  }
+
+  jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                       output_col);
+}
+
+/*
+ * Floating-point inverse DCT dispatcher.
+ *
+ * Order of preference: SSE2 (if available and jconst_idct_float_sse2
+ * passes IS_ALIGNED_SSE), then SSE (if available and
+ * jconst_idct_float_sse passes IS_ALIGNED_SSE), else 3DNow!.  Each routine
+ * receives compptr->dct_table, coef_block, output_buf and output_col.
+ * cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support == ~0U)
+    init_simd();
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_float_sse2)) {
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+    return;
+  }
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) {
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+    return;
+  }
+
+  jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+/*
+ * Report whether the SSE2 Huffman block encoder may be used.
+ * Returns 1 only when the build-time sizes match, SSE2 is available,
+ * simd_huffman is non-zero and jconst_huff_encode_one_block passes
+ * IS_ALIGNED_SSE; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) && simd_huffman &&
+         IS_ALIGNED_SSE(jconst_huff_encode_one_block);
+}
+
+/*
+ * Huffman-encode one block by forwarding every argument unchanged to the
+ * SSE2 routine and returning its result.
+ *
+ * No CPU-feature test is made here.  NOTE(review): callers are presumably
+ * expected to have consulted jsimd_can_huff_encode_one_block() first --
+ * confirm against the caller.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+/*
+ * Report whether the SSE2 "AC first prepare" helper may be used.
+ * Returns 1 when DCTSIZE is 8, JCOEF is 2 bytes, SIZEOF_SIZE_T is 4 and
+ * SSE2 is available; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 4)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * Forward every argument unchanged to the SSE2 "AC first prepare" routine.
+ *
+ * No CPU-feature test is made here.  NOTE(review): callers are presumably
+ * expected to have consulted jsimd_can_encode_mcu_AC_first_prepare()
+ * first -- confirm against the caller.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+/*
+ * Report whether the SSE2 "AC refine prepare" helper may be used.
+ * Returns 1 when DCTSIZE is 8, JCOEF is 2 bytes, SIZEOF_SIZE_T is 4 and
+ * SSE2 is available; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || SIZEOF_SIZE_T != 4)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * Forward every argument unchanged to the SSE2 "AC refine prepare" routine
+ * and return its int result.
+ *
+ * No CPU-feature test is made here.  NOTE(review): callers are presumably
+ * expected to have consulted jsimd_can_encode_mcu_AC_refine_prepare()
+ * first -- confirm against the caller.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+; unsigned int jpeg_simd_cpu_support(void)
+;
+; Out:   eax = OR of the JSIMD_* flags detected (0 if CPUID is unavailable
+;              or reports a maximum basic leaf of 0)
+; Saved: ebx, edi (pushed on entry, popped before ret)
+; Clobb: ecx, edx, flags
+;
+; edi accumulates the result throughout; every CPUID overwrites
+; eax/ebx/ecx/edx.
+EXTN(jpeg_simd_cpu_support):
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+    push        edi
+
+    xor         edi, edi                ; simd support flag
+
+    ; CPUID exists only if the ID bit (bit 21) of EFLAGS can be toggled.
+    pushfd
+    pop         eax
+    mov         edx, eax                ; edx = original EFLAGS
+    xor         eax, 1<<21              ; flip ID bit in EFLAGS
+    push        eax
+    popfd
+    pushfd
+    pop         eax
+    xor         eax, edx                ; non-zero iff the bit really changed
+    jz          near .return            ; CPUID is not supported
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    xor         eax, eax
+    cpuid                               ; eax = maximum basic leaf
+    test        eax, eax
+    jz          near .return
+    cmp         eax, 7
+    jb          short .no_avx2          ; Maximum leaf < 07H (the leaf number
+                                        ; is unsigned, hence jb and not jl)
+
+    ; Check for AVX2 instruction support
+    mov         eax, 7
+    xor         ecx, ecx                ; sub-leaf 0
+    cpuid
+    mov         eax, ebx
+    test        eax, 1<<5               ; bit5:AVX2
+    jz          short .no_avx2
+
+    ; Check for AVX2 O/S support
+    mov         eax, 1
+    xor         ecx, ecx
+    cpuid
+    test        ecx, 1<<27              ; bit27:OSXSAVE
+    jz          short .no_avx2          ; O/S does not support XSAVE
+    test        ecx, 1<<28              ; bit28:AVX
+    jz          short .no_avx2          ; CPU does not support AVX
+
+    xor         ecx, ecx                ; select XCR0
+    xgetbv
+    and         eax, 6                  ; bit1:XMM state, bit2:YMM state
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .no_avx2
+
+    or          edi, JSIMD_AVX2
+.no_avx2:
+
+    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    ; Check for MMX instruction support
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
+    ; Check for 3DNow! instruction support
+    mov         eax, 0x80000000
+    cpuid                               ; eax = maximum extended leaf
+    cmp         eax, 0x80000000
+    jbe         short .return           ; leaf 0x80000001 is not available
+
+    mov         eax, 0x80000001
+    cpuid
+    mov         eax, edx                ; eax = Extended feature flags
+
+    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
+    jz          short .no_3dnow
+    or          edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+    mov         eax, edi                ; return the accumulated flags
+
+    pop         edi
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h
new file mode 100644
index 0000000000..a28754adb9
--- /dev/null
+++ b/media/libjpeg/simd/jsimd.h
@@ -0,0 +1,1258 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2014, Linaro Limited.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+/*
+ * Each non-zero value is a distinct single bit, so several methods can be
+ * OR-ed together in one mask.  On i386, jpeg_simd_cpu_support() builds its
+ * return value from JSIMD_MMX, JSIMD_3DNOW, JSIMD_SSE, JSIMD_SSE2 and
+ * JSIMD_AVX2.  The remaining flags presumably belong to the non-x86
+ * back ends whose routines carry the matching suffix (_neon, _dspr2,
+ * _altivec, _mmi) -- confirm against those back ends.
+ */
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+#define JSIMD_NEON 0x10
+#define JSIMD_DSPR2 0x20
+#define JSIMD_ALTIVEC 0x40
+#define JSIMD_AVX2 0x80
+#define JSIMD_MMI 0x100
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
+
+/* RGB & extended RGB --> YCC Colorspace Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+/* RGB & extended RGB --> Grayscale Colorspace Conversion
+ *
+ * Seven entry points (rgb, extrgb, extrgbx, extbgr, extbgrx, extxbgr,
+ * extxrgb) are declared for each of the mmx, sse2, avx2, neon, dspr2, mmi and
+ * altivec suffixes, all with the same parameter list (JSAMPARRAY in,
+ * JSAMPIMAGE out).  The sse2 and avx2 groups additionally declare a
+ * jconst_rgb_gray_convert_* constant array.
+ */
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+/* YCC --> RGB & extended RGB Colorspace Conversion
+ *
+ * Seven entry points (rgb, extrgb, extrgbx, extbgr, extbgrx, extxbgr,
+ * extxrgb) per suffix: mmx, sse2, avx2, neon, dspr2, mmi, altivec.  Note the
+ * argument order: here the JSAMPIMAGE (with input_row) is the input and the
+ * JSAMPARRAY is the output.
+ *
+ * Extras declared in this section:
+ *   - jconst_ycc_rgb_convert_* constant arrays for sse2 and avx2;
+ *   - an rgb565 converter, for neon only;
+ *   - two *_neon_slowst3 variants (extrgb, extbgr) that are visible only
+ *     when NEON_INTRINSICS is not defined.
+ */
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#endif /* !NEON_INTRINSICS */
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+/* NULL Colorspace Conversion
+ *
+ * Only a dspr2 variant is declared.  Its parameter list is that of the
+ * RGB --> Grayscale converters plus a trailing num_components argument.
+ */
+EXTERN(void) jsimd_c_null_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows, int num_components);
+
+/* h2v1 Downsampling
+ *
+ * Declared for mmx, sse2, avx2, neon, dspr2 and altivec, all with the same
+ * parameter list.  Unlike the h2v2 section that follows, no mmi variant is
+ * declared here.
+ */
+EXTERN(void) jsimd_h2v1_downsample_mmx
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_sse2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_neon
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_altivec
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Downsampling
+ *
+ * Declared for mmx, sse2, avx2, neon, dspr2, mmi and altivec.  The parameter
+ * list is identical to that of the h2v1 downsamplers.
+ */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_neon
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mmi
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_altivec
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Smooth Downsampling
+ *
+ * dspr2 only.  NOTE: the parameter order is NOT that of the plain h2v1/h2v2
+ * downsamplers: the sample arrays come first, v_samp_factor precedes
+ * max_v_samp_factor, image_width is last, and there is an additional
+ * smoothing_factor argument.
+ */
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+ (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+ int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+ JDIMENSION image_width);
+
+
+/* Upsampling
+ *
+ * h2v1 and h2v2 entry points for mmx, sse2, avx2, neon, dspr2 and altivec
+ * (no mmi variant is declared).  Output is passed as a pointer to a
+ * JSAMPARRAY (output_data_ptr).
+ *
+ * jsimd_int_upsample_dspr2 is the odd one out: it takes explicit
+ * h_expand/v_expand arguments and a different argument order.
+ */
+EXTERN(void) jsimd_h2v1_upsample_mmx
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_mmx
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_sse2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_sse2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+ (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+ int max_v_samp_factor);
+
+EXTERN(void) jsimd_h2v1_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+/* Fancy Upsampling
+ *
+ * h2v1 and h2v2 entry points for mmx, sse2, avx2, neon, dspr2, mmi and
+ * altivec; an h1v2 entry point is declared for neon only.  The width
+ * argument is named downsampled_width here (the plain upsamplers call it
+ * output_width).  The sse2 and avx2 groups also declare a
+ * jconst_fancy_upsample_* constant array.
+ */
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+/* Merged Upsampling
+ *
+ * For each suffix there are seven h2v1 and seven h2v2 entry points (plain,
+ * extrgb, extrgbx, extbgr, extbgrx, extxbgr, extxrgb).  The mmx, sse2, avx2
+ * and neon prototypes all take the same four arguments: a JSAMPIMAGE input
+ * with in_row_group_ctr, and a JSAMPARRAY output.  The sse2 and avx2 groups
+ * each declare one jconst_merged_upsample_* constant array ahead of their
+ * h2v1 and h2v2 entry points.
+ */
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+/*
+ * dspr2 merged upsamplers (seven h2v1, then seven h2v2).  NOTE: unlike the
+ * mmx/sse2/avx2/neon merged-upsample prototypes, these take a fifth
+ * argument, JSAMPLE *range, so they are not call-compatible with them.
+ */
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+/* Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_sse2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_neon
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_altivec
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+/* Floating Point Sample Conversion */
+EXTERN(void) jsimd_convsamp_float_3dnow
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
+
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
+
+/* Fast Integer Forward DCT */
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
+
+/* Floating Point Forward DCT */
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
+
+/* Quantization */
+EXTERN(void) jsimd_quantize_mmx
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_sse2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_neon
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_dspr2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_mmi
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_altivec
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+/* Floating Point Quantization */
+EXTERN(void) jsimd_quantize_float_3dnow
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse2
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_dspr2
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+/* Scaled Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_mmx
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_sse2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_neon
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_neon
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+ (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+ (int *workspace, int *output);
+
+/* Accurate Integer Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_neon
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_dspr2
+ (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_altivec
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+/* Fast Integer Inverse DCT */
+EXTERN(void) jsimd_idct_ifast_mmx
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_neon
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+ (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+ const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+ (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+ const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_altivec
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+/* Floating Point Inverse DCT */
+EXTERN(void) jsimd_idct_float_3dnow
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+/* Huffman coding */
+extern const int jconst_huff_encode_one_block[];
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ UJCOEF *absvalues, size_t *bits);
diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c
new file mode 100644
index 0000000000..c6e789aa2f
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd.c
@@ -0,0 +1,1143 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0; /* all-ones = not probed yet; init_simd() tests for ~0U before probing */
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+/*
+ * Scan /proc/cpuinfo line by line for search_string.  simd_support is reset
+ * to 0 first, and JSIMD_DSPR2 is set only if some line contains the string.
+ * If the file cannot be opened, simd_support simply stays 0.
+ */
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+  char line[256];
+  int found = 0;
+  FILE *fp;
+
+  simd_support = 0;
+
+  fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL)
+    return;
+
+  /* Stop reading as soon as one line matches. */
+  while (!found && fgets(line, sizeof(line), fp) != NULL)
+    found = (strstr(line, search_string) != NULL);
+  fclose(fp);
+
+  if (found)
+    simd_support |= JSIMD_DSPR2;
+}
+
+#endif
+
+/*
+ * Work out which SIMD accelerations are usable and cache the answer in
+ * simd_support.  Calls after the first return immediately.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *forced;
+#endif
+
+  /* Anything other than the all-ones sentinel means we already probed. */
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  /* The compiler itself is targeting DSPr2. */
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* Not built with DSPr2 enabled globally: on Linux we can still detect it at
+   * run time by looking for the CPU model string in /proc/cpuinfo. */
+  parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+  /* Environment overrides.  JSIMD_FORCENONE is checked last, so it wins when
+   * both variables are set to "1". */
+  forced = getenv("JSIMD_FORCEDSPR2");
+  if (forced != NULL && strcmp(forced, "1") == 0)
+    simd_support = JSIMD_DSPR2;
+  forced = getenv("JSIMD_FORCENONE");
+  if (forced != NULL && strcmp(forced, "1") == 0)
+    simd_support = 0;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = { /* each entry repeats one 16-bit constant in both halfwords */
+ 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 (table stores 0x4540) */
+ 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 (table stores 0x5A80) */
+ 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 (table stores 0x7640) */
+ 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 (table stores 0xAC60) */
+}; /* NOTE(review): the stored halfwords differ from the FIX() results in their low bits; presumably deliberate, confirm against the DSPr2 assembly */
+
+/* The following struct is borrowed from jdsample.c.  jsimd_int_upsample()
+ * casts cinfo->upsample to my_upsample_ptr, so the layout presumably has to
+ * stay in sync with the definition there -- confirm when updating either. */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+typedef struct {
+ struct jpeg_upsampler pub;
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+ upsample1_ptr methods[MAX_COMPONENTS];
+ int next_row_out;
+ JDIMENSION rows_to_go;
+ int rowgroup_height[MAX_COMPONENTS];
+ UINT8 h_expand[MAX_COMPONENTS]; /* read by jsimd_int_upsample(), indexed by component_index */
+ UINT8 v_expand[MAX_COMPONENTS]; /* read by jsimd_int_upsample(), indexed by component_index */
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+/*
+ * Returns 1 if the DSPr2 RGB-to-YCC conversion may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The DSPr2 code handles only 8-bit samples, a 4-byte JDIMENSION and 3- or
+   * 4-byte RGB pixels. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 RGB-to-grayscale conversion may be used, else 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* Build-time constants the DSPr2 code was written for. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) != 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 YCC-to-RGB conversion may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* Only 8-bit samples, 4-byte JDIMENSION and 3-/4-byte RGB are supported. */
+  if (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
+      (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4)) {
+    if (simd_support & JSIMD_DSPR2)
+      usable = 1;
+  }
+
+  return usable;
+}
+
+/*
+ * YCC-to-RGB565 is never reported as available on MIPS; the matching
+ * jsimd_ycc_rgb565_convert() in this file is an empty stub.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 compression-side null conversion may be used,
+ * 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Select the DSPr2 RGB-to-YCC routine that matches cinfo->in_color_space and
+ * call it with the image width and the caller's buffers.  Color spaces
+ * without a dedicated case use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  /* Start from the fallback; the switch only overrides it. */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_ycc_convert_dspr2;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    /* JCS_EXT_RGB and everything else keep the extrgb routine. */
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Select the DSPr2 RGB-to-grayscale routine that matches
+ * cinfo->in_color_space and call it.  JCS_EXT_RGB and any color space without
+ * its own case share the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*to_gray) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    to_gray = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    to_gray = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    to_gray = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    to_gray = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    to_gray = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    to_gray = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  }
+
+  to_gray(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Select the DSPr2 YCC-to-RGB routine that matches cinfo->out_color_space and
+ * call it with the output width and the caller's buffers.  Color spaces
+ * without a dedicated case use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  /* Fallback first; the switch replaces it for the other pixel layouts. */
+  void (*to_rgb) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_extrgb_convert_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    to_rgb = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    to_rgb = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    to_rgb = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    to_rgb = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    to_rgb = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    /* JCS_EXT_RGB and everything else keep the extrgb routine. */
+    break;
+  }
+
+  to_rgb(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/*
+ * Empty stub: jsimd_can_ycc_rgb565() in this file always returns 0, and this
+ * function does nothing with its arguments.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+/*
+ * Forward to the DSPr2 null-conversion routine, adding the image width and
+ * component count taken from cinfo.
+ */
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+  jsimd_c_null_convert_dspr2(cinfo->image_width,
+                             input_buf,
+                             output_buf,
+                             output_row,
+                             num_rows,
+                             cinfo->num_components);
+}
+
+/*
+ * Always returns 0 at present: the check that would enable the DSPr2 h2v2
+ * downsampler is compiled out (see FIXME below).
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 smoothing h2v2 downsampler may be used, else 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* Requires 8-bit samples, a 4-byte JDIMENSION and 8x8 DCT blocks. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 || DCTSIZE != 8)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Always returns 0 at present: the check that would enable the DSPr2 h2v1
+ * downsampler is compiled out (see FIXME below).
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+/*
+ * Forward to the DSPr2 h2v2 downsampler, unpacking the width, sampling
+ * factors and block count it needs from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_dspr2(cinfo->image_width,
+                              cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor,
+                              compptr->width_in_blocks,
+                              input_data,
+                              output_data);
+}
+
+/*
+ * Forward to the DSPr2 smoothing h2v2 downsampler.  Unlike the plain
+ * downsample wrappers in this file, the DSPr2 routine takes the buffers
+ * first, followed by the parameters read from compptr and cinfo.
+ */
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_smooth_downsample_dspr2(input_data,
+                                     output_data,
+                                     compptr->v_samp_factor,
+                                     cinfo->max_v_samp_factor,
+                                     cinfo->smoothing_factor,
+                                     compptr->width_in_blocks,
+                                     cinfo->image_width);
+}
+
+/*
+ * Forward to the DSPr2 h2v1 downsampler, unpacking the width, sampling
+ * factors and block count it needs from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_dspr2(cinfo->image_width,
+                              cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor,
+                              compptr->width_in_blocks,
+                              input_data,
+                              output_data);
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v2 upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v1 upsampler may be used, 0 otherwise.  It is
+ * only ever reported as usable when __MIPSEL__ is defined.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Returns 1 if the DSPr2 integer-factor upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) != 0;
+}
+
+/*
+ * Forward to the DSPr2 h2v2 upsampler.  compptr is not used here; the routine
+ * gets only cinfo's max vertical sampling factor and output width.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor,
+                            cinfo->output_width,
+                            input_data,
+                            output_data_ptr);
+}
+
+/*
+ * Forward to the DSPr2 h2v1 upsampler.  compptr is not used here; the routine
+ * gets only cinfo's max vertical sampling factor and output width.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor,
+                            cinfo->output_width,
+                            input_data,
+                            output_data_ptr);
+}
+
+/*
+ * Forward to the DSPr2 integer-factor upsampler.  The horizontal and vertical
+ * expansion factors for this component are read from the upsampler state
+ * hanging off cinfo->upsample.
+ */
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr state = (my_upsample_ptr)cinfo->upsample;
+  int ci = compptr->component_index;
+
+  jsimd_int_upsample_dspr2(state->h_expand[ci], state->v_expand[ci],
+                           input_data, output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v2 fancy upsampler may be used, 0 otherwise.  It
+ * is only ever reported as usable when __MIPSEL__ is defined.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v1 fancy upsampler may be used, 0 otherwise.  It
+ * is only ever reported as usable when __MIPSEL__ is defined.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Forward to the DSPr2 h2v2 fancy upsampler, passing cinfo's max vertical
+ * sampling factor and the component's downsampled width.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width,
+                                  input_data,
+                                  output_data_ptr);
+}
+
+/*
+ * Forward to the DSPr2 h2v1 fancy upsampler, passing cinfo's max vertical
+ * sampling factor and the component's downsampled width.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width,
+                                  input_data,
+                                  output_data_ptr);
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v2 merged upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if the DSPr2 h2v1 merged upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) != 0;
+}
+
+/*
+ * Select the DSPr2 h2v2 merged-upsample routine that matches
+ * cinfo->out_color_space and call it.  The DSPr2 routines take
+ * cinfo->sample_range_limit as an extra trailing argument.  Color spaces
+ * without a dedicated case use the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Fallback first; the switch replaces it for the other pixel layouts. */
+  void (*merged) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *) =
+    jsimd_h2v2_extrgb_merged_upsample_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    merged = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    merged = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    merged = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    merged = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    merged = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    /* JCS_EXT_RGB and everything else keep the extrgb routine. */
+    break;
+  }
+
+  merged(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+         cinfo->sample_range_limit);
+}
+
+/*
+ * Select the DSPr2 h2v1 merged-upsample routine that matches
+ * cinfo->out_color_space and call it.  The DSPr2 routines take
+ * cinfo->sample_range_limit as an extra trailing argument.  JCS_EXT_RGB and
+ * any color space without its own case share the extrgb routine.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*merged) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    merged = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    merged = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    merged = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    merged = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    merged = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    merged = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  merged(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+         cinfo->sample_range_limit);
+}
+
+/*
+ * Returns 1 if the DSPr2 integer sample conversion may be used, 0 otherwise.
+ * It is only ever reported as usable when __MIPSEL__ is defined.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* Requires 8x8 blocks, 8-bit samples, a 4-byte JDIMENSION and a 2-byte
+   * DCTELEM. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Returns 1 if the DSPr2 floating-point sample conversion may be used,
+ * 0 otherwise.  Always 0 when built with __mips_soft_float defined.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* Requires 8x8 blocks, 8-bit samples, a 4-byte JDIMENSION, and 2-byte
+   * JCOEF and ISLOW_MULT_TYPE. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Forward to the DSPr2 integer sample-conversion routine with the arguments
+ * unchanged.
+ */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_dspr2(sample_data,
+                       start_col,
+                       workspace);
+}
+
+/*
+ * Forward to the DSPr2 floating-point sample-conversion routine.  When
+ * __mips_soft_float is defined the call is compiled out and this function
+ * does nothing.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_convsamp_float_dspr2(sample_data,
+                             start_col,
+                             workspace);
+#endif
+}
+
+/*
+ * Returns 1 if the DSPr2 accurate-integer forward DCT may be used,
+ * 0 otherwise.  It is only ever reported as usable when __MIPSEL__ is
+ * defined.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* Requires 8x8 blocks and a 2-byte DCTELEM. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Report whether the DSPr2 "ifast" forward DCT may be used.  Returns 1 only
+ * when the compile-time assumptions below hold, __MIPSEL__ is defined, and
+ * simd_support has the JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/* Always returns 0: this file never reports a SIMD float forward DCT. */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+/* "islow" forward DCT: forwards data unchanged to jsimd_fdct_islow_dspr2(). */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_dspr2(data);
+}
+
+/* "ifast" forward DCT: forwards data unchanged to jsimd_fdct_ifast_dspr2(). */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_dspr2(data);
+}
+
+/*
+ * Empty stub: does nothing with data.  jsimd_can_fdct_float() in this file
+ * always returns 0.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+/*
+ * Report whether the DSPr2 integer quantizer may be used.  Returns 1 only
+ * when the compile-time assumptions below hold and simd_support has the
+ * JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Report whether the DSPr2 floating-point quantizer may be used.  Returns 1
+ * only when the compile-time assumptions below hold, the build is not
+ * soft-float (__mips_soft_float undefined), and simd_support has the
+ * JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#if !defined(__mips_soft_float)
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Integer quantization: forwards all three arguments unchanged to
+ * jsimd_quantize_dspr2().
+ */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+/*
+ * Floating-point quantization.  Forwards to jsimd_quantize_float_dspr2()
+ * unless the build is soft-float (__mips_soft_float defined), in which case
+ * this function does nothing.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+#if !defined(__mips_soft_float)
+  jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+/*
+ * Report whether the DSPr2 2x2 reduced-size inverse DCT may be used.
+ * Returns 1 only when the compile-time assumptions below hold and
+ * simd_support has the JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Report whether the DSPr2 4x4 reduced-size inverse DCT may be used.
+ * Returns 1 only when the compile-time assumptions below hold, __MIPSEL__ is
+ * defined, and simd_support has the JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/*
+ * Report whether the DSPr2 6x6 inverse DCT may be used.  Returns 1 only when
+ * the compile-time assumptions below hold and simd_support has the
+ * JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Report whether the DSPr2 12x12 inverse DCT may be used.  Returns 1 only
+ * when the compile-time assumptions below hold and simd_support has the
+ * JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8 || sizeof(JCOEF) != 2 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * 2x2 inverse DCT: forwards compptr->dct_table, coef_block, output_buf and
+ * output_col to jsimd_idct_2x2_dspr2().  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * 4x4 inverse DCT: forwards compptr->dct_table, coef_block, output_buf and
+ * output_col to jsimd_idct_4x4_dspr2(), together with a stack-allocated
+ * scratch array of DCTSIZE * 4 ints.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+ jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+ workspace);
+}
+
+/*
+ * 6x6 inverse DCT: forwards compptr->dct_table, coef_block, output_buf and
+ * output_col to jsimd_idct_6x6_dspr2().  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * 12x12 inverse DCT, run as two DSPr2 passes.  Pass 1 reads coef_block and
+ * compptr->dct_table and fills a 96-int scratch array; pass 2 consumes that
+ * array together with a table of 12 output addresses (output_buf[i] +
+ * output_col, each stored as an int).  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int workspace[96];
+  int output[12];
+  int row;
+
+  /* Precompute the destination address of each of the 12 output rows. */
+  for (row = 0; row < 12; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+/*
+ * Report whether the DSPr2 "islow" inverse DCT may be used.  Returns 1 only
+ * when the compile-time assumptions below hold and simd_support has the
+ * JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Report whether the DSPr2 "ifast" inverse DCT may be used.  Returns 1 only
+ * when the compile-time assumptions below hold, __MIPSEL__ is defined, and
+ * simd_support has the JSIMD_DSPR2 bit set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+#ifdef __MIPSEL__
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+#else
+  return 0;
+#endif
+}
+
+/* Always returns 0: this file never reports a SIMD float inverse DCT. */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+/*
+ * "islow" inverse DCT.  Builds a table of 8 output addresses
+ * (output_buf[i] + output_col, each stored as an int) and passes it to
+ * jsimd_idct_islow_dspr2() along with coef_block, compptr->dct_table and
+ * IDCT_range_limit(cinfo).
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int output[8];
+  int row;
+
+  /* Precompute the destination address of each of the 8 output rows. */
+  for (row = 0; row < 8; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+/*
+ * "ifast" inverse DCT, run as two DSPr2 passes that share a DCTSIZE2-element
+ * workspace and the mips_idct_ifast_coefs table.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+  IFAST_MULT_TYPE *quant_table = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+  /* Pass 1: process columns from input, store into work array. */
+  jsimd_idct_ifast_cols_dspr2(coef_block, quant_table, workspace,
+                              mips_idct_ifast_coefs);
+
+  /* Pass 2: process rows from work array, store into output array.
+   * Note that we must descale the results by a factor of 8 == 2**3,
+   * and also undo the PASS1_BITS scaling.
+   */
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+/*
+ * Empty stub: touches none of its arguments.  jsimd_can_idct_float() in this
+ * file always returns 0.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/* Always returns 0: this file never reports SIMD Huffman block encoding. */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+/*
+ * Stub: ignores all arguments and returns NULL.
+ * jsimd_can_huff_encode_one_block() in this file always returns 0.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+/* Always returns 0: this file never reports a SIMD AC-first prepare step. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+/*
+ * Empty stub: writes nothing to values or zerobits.
+ * jsimd_can_encode_mcu_AC_first_prepare() in this file always returns 0.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{
+}
+
+/* Always returns 0: this file never reports a SIMD AC-refine prepare step. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+/*
+ * Stub: writes nothing to absvalues or bits and returns 0.
+ * jsimd_can_encode_mcu_AC_refine_prepare() in this file always returns 0.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
+ * Darko Laus <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression: de-interleaves each input row into one
+ * output row per component.  For every component ci, sample ci of each
+ * input pixel is copied to output_buf[ci][output_row]; consecutive samples
+ * of one component are num_components bytes apart in the input row.
+ *
+ * Register roles:
+ *   t9 = rows still to process        s0 = num_components (input stride)
+ *   t0 = image_width & 3 (residual)   t1 = ci (component index)
+ *   t2 = inptr                        t5 = outptr
+ *   t6 = outptr + residual            s1 = outptr + image_width (row end)
+ *
+ * Two row loops exist: labels 0-3 handle widths that are not a multiple of
+ * 4 (byte loop for the residual, then 4 samples per iteration); labels 4-6
+ * handle widths that are a multiple of 4.
+ *
+ * The stack arguments are read at 24(sp)/28(sp) rather than 16/20,
+ * presumably because SAVE_REGS_ON_STACK 8 lowers sp by 8 -- verify against
+ * jsimd_dspr2_asm.h.
+ */
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    lw          t9, 24(sp)        /* t9 = num_rows */
+    lw          s0, 28(sp)        /* s0 = cinfo->num_components */
+    andi        t0, a0, 3         /* t0 = cinfo->image_width & 3 */
+    beqz        t0, 4f            /* no residual */
+     nop
+0:                                /* --- row loop, width % 4 != 0 --- */
+    addiu       t9, t9, -1
+    bltz        t9, 7f            /* all rows done */
+     li         t1, 0             /* (delay slot) ci = 0 */
+1:                                /* --- component loop --- */
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)        /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)         /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)        /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1        /* inptr += ci */
+    addu        s1, t5, a0        /* s1 = end of output row */
+    addu        t6, t5, t0        /* t6 = end of residual samples */
+2:                                /* --- residual: one sample per pass --- */
+    lbu         t3, 0(t2)
+    addiu       t5, t5, 1
+    sb          t3, -1(t5)
+    bne         t6, t5, 2b
+     addu       t2, t2, s0        /* (delay slot) inptr += num_components */
+    /*
+     * If image_width < 4 the residual loop has already filled the whole
+     * row.  The loop below is a do-while that writes 4 samples before it
+     * tests for the end, so entering it here would write past the row end
+     * and never meet its exit condition.  Skip it in that case.
+     */
+    beq         s1, t5, 8f
+     nop
+3:                                /* --- main: four samples per pass --- */
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0        /* inptr += 4 * num_components */
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 3b
+     sb         t8, -1(t5)        /* (delay slot) */
+8:
+    addiu       t1, t1, 1         /* ++ci */
+    bne         t1, s0, 1b
+     nop
+    addiu       a1, a1, 4         /* ++input_buf */
+    bgez        t9, 0b            /* t9 >= 0 here; label 0 does the test */
+     addiu      a3, a3, 1         /* (delay slot) ++output_row */
+    b           7f
+     nop
+4:                                /* --- row loop, width % 4 == 0 --- */
+    addiu       t9, t9, -1
+    bltz        t9, 7f            /* all rows done */
+     li         t1, 0             /* (delay slot) ci = 0 */
+5:                                /* --- component loop --- */
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)        /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)         /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)        /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1        /* inptr += ci */
+    addu        s1, t5, a0        /* s1 = end of output row */
+    addu        t6, t5, t0        /* t0 == 0 on this path; t6 is not read */
+6:                                /* --- four samples per pass --- */
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0        /* inptr += 4 * num_components */
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 6b
+     sb         t8, -1(t5)        /* (delay slot) */
+    addiu       t1, t1, 1         /* ++ci */
+    bne         t1, s0, 5b
+     nop
+    addiu       a1, a1, 4         /* ++input_buf */
+    bgez        t9, 4b            /* t9 >= 0 here; label 4 does the test */
+     addiu      a3, a3, 1         /* (delay slot) ++output_row */
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+/*
+ * Macro arguments:
+ *   colorid    - inserted into the generated function name
+ *                (jsimd_<colorid>_ycc_convert_dspr2)
+ *   pixel_size - number of bytes the input pointer advances per pixel
+ *   r_offs, g_offs, b_offs - byte offsets of the R, G and B samples within
+ *                one input pixel
+ */
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+/* Load one pixel's R, G, B bytes into \r, \g, \b and advance \inptr. */
+.macro DO_RGB_TO_YCC r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ *
+ * Register roles inside the loops:
+ * t7 = rows remaining, t6 = input pointer, t0/t1/t2 = output pointers for
+ * output_buf[0]/[1]/[2], t9 = end address of the output_buf[2] row,
+ * s0-s7 and t8 = fixed-point constants loaded below.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw t7, 48(sp) /* t7 = num_rows */
+ li s0, 0x4c8b /* FIX(0.29900) */
+ li s1, 0x9646 /* FIX(0.58700) */
+ li s2, 0x1d2f /* FIX(0.11400) */
+ li s3, 0xffffd4cd /* -FIX(0.16874) */
+ li s4, 0xffffab33 /* -FIX(0.33126) */
+ li s5, 0x8000 /* FIX(0.50000) */
+ li s6, 0xffff94d1 /* -FIX(0.41869) */
+ li s7, 0xffffeb2f /* -FIX(0.08131) */
+ li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+ /* Per-row setup. */
+ addiu t7, -1 /* --num_rows */
+ lw t6, 0(a1) /* t6 = input_buf[0] */
+ lw t0, 0(a2)
+ lw t1, 4(a2)
+ lw t2, 8(a2)
+ sll t3, a3, 2
+ lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */
+ lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */
+ lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */
+
+ addu t9, t2, a0 /* t9 = end address */
+ addiu a3, 1
+
+1:
+ /* Per-pixel loop: t3 = r, t4 = g, t5 = b. */
+ DO_RGB_TO_YCC t3, t4, t5, t6
+
+ mtlo s5, $ac0 /* ac0.lo = 0x8000 */
+ mtlo t8, $ac1 /* ac1.lo = 0x807fff */
+ mtlo t8, $ac2 /* ac2.lo = 0x807fff */
+ maddu $ac0, s2, t5 /* ac0 += s2 * b */
+ maddu $ac1, s5, t5 /* ac1 += s5 * b */
+ maddu $ac2, s5, t3 /* ac2 += s5 * r */
+ maddu $ac0, s0, t3 /* ac0 += s0 * r */
+ maddu $ac1, s3, t3 /* ac1 += s3 * r */
+ maddu $ac2, s6, t4 /* ac2 += s6 * g */
+ maddu $ac0, s1, t4 /* ac0 += s1 * g */
+ maddu $ac1, s4, t4 /* ac1 += s4 * g */
+ maddu $ac2, s7, t5 /* ac2 += s7 * b */
+ extr.w t3, $ac0, 16 /* t3 = ac0 >> 16 */
+ extr.w t4, $ac1, 16 /* t4 = ac1 >> 16 */
+ extr.w t5, $ac2, 16 /* t5 = ac2 >> 16 */
+ sb t3, 0(t0) /* low byte -> output_buf[0] row */
+ sb t4, 0(t1) /* low byte -> output_buf[1] row */
+ sb t5, 0(t2) /* low byte -> output_buf[2] row */
+ addiu t0, 1
+ addiu t2, 1
+ bne t2, t9, 1b
+ addiu t1, 1 /* (delay slot) */
+ bgtz t7, 0b
+ addiu a1, 4 /* (delay slot) ++input_buf */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+/*
+ * Macro arguments:
+ *   colorid    - inserted into the generated function name
+ *                (jsimd_ycc_<colorid>_convert_dspr2)
+ *   pixel_size - number of bytes the output pointer advances per pixel
+ *   r_offs, g_offs, b_offs - byte offsets of R, G and B within an output pixel
+ *   a_offs     - byte offset of the fourth byte, written as 0xFF only when
+ *                pixel_size == 4
+ */
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs, a_offs
+
+/*
+ * Store one pixel from \scratch0 (R), \scratch1 (G), \scratch2 (B) and
+ * advance \outptr.  Overwrites t0 when pixel_size == 4.
+ */
+.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r_offs(\outptr)
+ sb \scratch1, \g_offs(\outptr)
+ sb \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+ li t0, 0xFF
+ sb t0, \a_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = input_row
+ * a3 = output_buf
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s1, 48(sp) /* s1 = num_rows */
+ li t3, 0x8000 /* added to the green term before its >> 16 */
+ li t4, 0x166e9 /* FIX(1.40200) */
+ li t5, 0x1c5a2 /* FIX(1.77200) */
+ li t6, 0xffff492e /* -FIX(0.71414) */
+ li t7, 0xffffa7e6 /* -FIX(0.34414) */
+ repl.ph t8, 128 /* t8 = 128 in both halfwords */
+
+0:
+ /* Per-row setup. */
+ lw s0, 0(a3) /* s0 = outptr = *output_buf */
+ lw t0, 0(a1) /* t0 = input_buf[0] */
+ lw t1, 4(a1) /* t1 = input_buf[1] */
+ lw t2, 8(a1) /* t2 = input_buf[2] */
+ sll s5, a2, 2
+ addiu s1, -1 /* --num_rows */
+ lwx s2, s5(t0) /* s2 = input_buf[0][input_row] (y) */
+ lwx s3, s5(t1) /* s3 = input_buf[1][input_row] (cb) */
+ lwx s4, s5(t2) /* s4 = input_buf[2][input_row] (cr) */
+ addu t9, s2, a0 /* t9 = end address of the y row */
+ addiu a2, 1 /* ++input_row */
+
+1:
+ /* Per-pixel loop. */
+ lbu s7, 0(s4) /* cr */
+ lbu s6, 0(s3) /* cb */
+ lbu s5, 0(s2) /* y */
+ addiu s2, 1
+ addiu s4, 1
+ addiu s7, -128
+ addiu s6, -128
+ mul t2, t7, s6
+ mul t0, t6, s7 /* Crgtab[cr] */
+ sll s7, 15
+ mulq_rs.w t1, t4, s7 /* Crrtab[cr] */
+ sll s6, 15
+ addu t2, t3 /* Cbgtab[cb] */
+ addu t2, t0
+
+ mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */
+ sra t2, 16
+ addu t1, s5
+ addu t2, s5 /* add y */
+ ins t2, t1, 16, 16 /* t2 = red (upper half) | green (lower half) */
+ subu.ph t2, t2, t8
+ addu t0, s5
+ shll_s.ph t2, t2, 8
+ subu t0, 128
+ shra.ph t2, t2, 8
+ shll_s.w t0, t0, 24
+ addu.ph t2, t2, t8 /* clip & store */
+ sra t0, t0, 24
+ sra t1, t2, 16 /* t1 = red halfword */
+ addiu t0, 128
+
+ STORE_YCC_TO_RGB t1, t2, t0, s0
+
+ bne s2, t9, 1b
+ addiu s3, 1 /* (delay slot) advance cb pointer */
+ bgtz s1, 0b
+ addiu a3, 4 /* (delay slot) ++output_buf */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R G B A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+/*
+ * Macro arguments:
+ *   colorid    - inserted into the generated function name
+ *                (jsimd_<colorid>_gray_convert_dspr2)
+ *   pixel_size - number of bytes the input pointer advances per pixel
+ *   r_offs, g_offs, b_offs - byte offsets of the R, G and B samples within
+ *                one input pixel
+ */
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+/* Load one pixel's R, G, B bytes into \r, \g, \b and advance \inptr. */
+.macro DO_RGB_TO_GRAY r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ *
+ * Each row is handled by a 4-pixels-per-iteration loop (label 1) followed by
+ * a 1-pixel-per-iteration loop (label 3) for the image_width & 3 leftover
+ * pixels.  Either loop is skipped when it has nothing to do.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ li s0, 0x4c8b /* s0 = FIX(0.29900) */
+ li s1, 0x9646 /* s1 = FIX(0.58700) */
+ li s2, 0x1d2f /* s2 = FIX(0.11400) */
+ li s7, 0x8000 /* s7 = FIX(0.50000) */
+ lw s6, 48(sp)
+ andi t7, a0, 3 /* t7 = image_width & 3 (leftover pixels) */
+
+0:
+ /* Per-row setup. */
+ addiu s6, -1 /* s6 = num_rows */
+ lw t0, 0(a1) /* t0 = inptr = *input_buf */
+ lw t1, 0(a2)
+ sll t3, a3, 2
+ lwx t1, t3(t1) /* t1 = outptr = output_buf[0][output_row] */
+ addiu a3, 1
+ addu t9, t1, a0 /* t9 = end of output row */
+ subu t8, t9, t7 /* t8 = end of the 4-pixel groups */
+ beq t1, t8, 2f /* fewer than 4 pixels: skip the main loop */
+ nop
+
+1:
+ /* Main loop: four pixels per iteration, using $ac0 and $ac1 alternately. */
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t6, $ac0, 16 /* t6 = result for pixel 0 */
+
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ extr.w t2, $ac1, 16 /* t2 = result for pixel 1 */
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t5, $ac0, 16 /* t5 = result for pixel 2 */
+ sb t6, 0(t1)
+ sb t2, 1(t1)
+ extr.w t3, $ac1, 16 /* t3 = result for pixel 3 */
+ addiu t1, 4
+ sb t5, -2(t1)
+ sb t3, -1(t1)
+ bne t1, t8, 1b
+ nop
+
+2:
+ beqz t7, 4f /* no leftover pixels */
+ nop
+
+3:
+ /* Leftover loop: one pixel per iteration. */
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ extr.w t6, $ac0, 16
+ sb t6, 0(t1)
+ addiu t1, 1
+ bne t1, t9, 3b
+ nop
+
+4:
+ bgtz s6, 0b
+ addiu a1, 4 /* (delay slot) ++input_buf */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+/*
+ * Macro arguments:
+ *   colorid    - inserted into the generated function name
+ *                (jsimd_h2v2_<colorid>_merged_upsample_dspr2)
+ *   pixel_size - number of bytes the output pointer advances per PAIR of
+ *                pixels (6 or 8 in the instantiations below)
+ *   r1/g1/b1/a1_offs - byte offsets of the first pixel's components
+ *   r2/g2/b2/a2_offs - byte offsets of the second pixel's components
+ * The a1/a2 bytes are written (as 0xFF) only when pixel_size == 8.
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+/*
+ * Store two adjacent pixels and advance \outptr by \pixel_size.
+ * Overwrites \scratch0 when pixel_size == 8.
+ */
+.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li \scratch0, 0xFF
+ sb \scratch0, \a1_offs(\outptr)
+ sb \scratch0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+/*
+ * Store a single pixel; \outptr is not advanced.
+ * Overwrites t0 when pixel_size == 8.
+ */
+.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ *
+ * Register roles inside the loop:
+ * t1/t2 = luma pointers for the two input rows, t5/t6 = the two chroma
+ * pointers, t4/t7 = the two output row pointers, t9 = sample_range_limit
+ * (indexed by y + chroma term to obtain each clamped output byte),
+ * t0 = end address for t5.  AT and ra are used as scratch registers.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ lw t9, 56(sp) /* cinfo->sample_range_limit */
+ lw v0, 0(a1)
+ lw v1, 4(a1)
+ lw t0, 8(a1)
+ sll t1, a2, 3
+ addiu t2, t1, 4
+ sll t3, a2, 2
+ lw t4, 0(a3) /* t4 = output_buf[0] */
+ lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */
+ lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+ lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */
+ lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */
+ lw t7, 4(a3) /* t7 = output_buf[1] */
+ li s1, 0xe6ea
+ addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */
+ addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s1, zero, 0xa7e6 /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s2, s1, 0xeec8 /* s2 = 0xffff492e [-FIX(0.71414)] */
+ srl t3, a0, 1 /* t3 = output_width / 2 = number of pixel pairs */
+ blez t3, 2f
+ addu t0, t5, t3 /* t0 = end address */
+ 1:
+ /* Main loop: one chroma sample pair -> 2x2 output pixels per iteration. */
+ lbu t3, 0(t5)
+ lbu s3, 0(t6)
+ addiu t5, t5, 1
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ lbu v0, 0(t1) /* first row, first y */
+ addiu t6, t6, 1
+ addiu t1, t1, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, -1(t1) /* first row, second y */
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2) /* second row, first y */
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, 1(t2) /* second row, second y */
+ addiu t2, t2, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+ bne t0, t5, 1b
+ nop
+2:
+ /* Odd output_width: emit one final pixel in each of the two rows. */
+ andi t0, a0, 1
+ beqz t0, 4f
+ lbu t3, 0(t5) /* (delay slot) executed even when the branch is taken */
+ lbu s3, 0(t6)
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ lbu v0, 0(t1)
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+/*
+ * Macro arguments:
+ *   colorid    - inserted into the generated function name
+ *                (jsimd_h2v1_<colorid>_merged_upsample_dspr2)
+ *   pixel_size - number of bytes the output pointer advances per PAIR of
+ *                pixels (6 or 8 in the instantiations below)
+ *   r1/g1/b1/a1_offs - byte offsets of the first pixel's components
+ *   r2/g2/b2/a2_offs - byte offsets of the second pixel's components
+ * The a1/a2 bytes are written (as 0xFF) only when pixel_size == 8.
+ */
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+/*
+ * Store two adjacent pixels and advance \outptr by \pixel_size.
+ * Overwrites t0 when pixel_size == 8.
+ */
+.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+ sb t0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+/*
+ * Store a single pixel; \outptr is not advanced.
+ * Overwrites t0 when pixel_size == 8.
+ */
+.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = range_limit
+ *
+ * Each main-loop iteration consumes one cb and one cr sample plus two y
+ * samples and stores two output pixels; an odd output_width is finished by
+ * the single-pixel code at label 3.  ra is used as a scratch register.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ li t0, 0xe6ea
+ lw t1, 0(a1) /* t1 = input_buf[0] */
+ lw t2, 4(a1) /* t2 = input_buf[1] */
+ lw t3, 8(a1) /* t3 = input_buf[2] */
+ lw t8, 56(sp) /* t8 = range_limit */
+ addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */
+ addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s0, t0, 0x9916 /* s0 = 0x8000 */
+ addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
+ srl t0, a0, 1 /* t0 = output_width / 2 = number of pixel pairs */
+ sll t4, a2, 2
+ lwx s5, t4(t1) /* s5 = inptr0 */
+ lwx s6, t4(t2) /* s6 = inptr1 */
+ lwx s7, t4(t3) /* s7 = inptr2 */
+ lw t7, 0(a3) /* t7 = outptr */
+ blez t0, 2f
+ addu t9, s6, t0 /* t9 = end address */
+1:
+ lbu t2, 0(s6) /* t2 = cb */
+ lbu t0, 0(s7) /* t0 = cr */
+ lbu t1, 0(s5) /* t1 = y */
+ addiu t2, t2, -128 /* t2 = cb - 128 */
+ addiu t0, t0, -128 /* t0 = cr - 128 */
+ mult $ac1, s4, t2
+ madd $ac1, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w t5, $ac1, 16
+ mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+ addiu s7, s7, 1
+ addiu s6, s6, 1
+ addu t2, t1, t0 /* t2 = y + cred */
+ addu t3, t1, t5 /* t3 = y + cgreen */
+ addu t4, t1, t6 /* t4 = y + cblue */
+ addu t2, t8, t2 /* index into range_limit */
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t1, 1(s5) /* t1 = second y of the pair */
+ lbu v0, 0(t2)
+ lbu v1, 0(t3)
+ lbu ra, 0(t4)
+ addu t2, t1, t0
+ addu t3, t1, t5
+ addu t4, t1, t6
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+ bne t9, s6, 1b
+ addiu s5, s5, 2 /* (delay slot) inptr0 += 2 */
+2:
+ andi t0, a0, 1 /* odd output_width? */
+ beqz t0, 4f
+ nop
+3:
+ /* Final single pixel. */
+ lbu t2, 0(s6)
+ lbu t0, 0(s7)
+ lbu t1, 0(s5)
+ addiu t2, t2, -128 /* (cb - 128) */
+ addiu t0, t0, -128 /* (cr - 128) */
+ mul t3, s4, t2
+ mul t4, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */
+ addu t3, t3, s0
+ addu t3, t4, t3
+ sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+ addu t2, t1, t0 /* y + cred */
+ addu t3, t1, t5 /* y + cgreen */
+ addu t4, t1, t6 /* y + cblue */
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ *
+ * Register roles:
+ * s2 = *output_data_ptr (array of output row pointers)
+ * s4 = byte offset into that array; grows by 4 per output row written
+ * t9 = counts the two output rows produced per input row
+ * s0 = inptr0 (current input row, loaded from 0(a2))
+ * s1 = inptr1 (row pointer at -4(a2) for the first output row of a pair,
+ * at 4(a2) for the second)
+ * s3 = outptr, t7 = last column sum, t6 = this column sum
+ */
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ li s4, 0
+ lw s2, 0(a3) /* s2 = *output_data_ptr */
+0:
+ li t9, 2
+ lw s1, -4(a2) /* s1 = inptr1 */
+
+1:
+ /* First column of the row is handled separately. */
+ lw s0, 0(a2) /* s0 = inptr0 */
+ lwx s3, s4(s2)
+ addiu s5, a1, -2 /* s5 = downsampled_width - 2 */
+ srl t4, s5, 1
+ sll t4, t4, 1
+ lbu t0, 0(s0)
+ lbu t1, 1(s0)
+ lbu t2, 0(s1)
+ lbu t3, 1(s1)
+ addiu s0, 2
+ addiu s1, 2
+ addu t8, s0, t4 /* t8 = end address */
+ andi s5, s5, 1 /* s5 = residual */
+ sll t4, t0, 1
+ sll t6, t1, 1
+ addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */
+ addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */
+ addu t7, t0, t2 /* t7 = thiscolsum */
+ addu t6, t1, t3 /* t6 = nextcolsum */
+ sll t0, t7, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t7 /* t1 = thiscolsum * 3 */
+ shra_r.w t0, t0, 4
+ addiu t1, 7
+ addu t1, t1, t6
+ srl t1, t1, 4
+ sb t0, 0(s3)
+ sb t1, 1(s3)
+ beq t8, s0, 22f /* skip to final iteration if width == 3 */
+ addiu s3, 2
+2:
+ /* Main loop: two input columns -> four output bytes per iteration. */
+ lh t0, 0(s0) /* t0 = A3|A2 */
+ lh t2, 0(s1) /* t2 = B3|B2 */
+ addiu s0, 2
+ addiu s1, 2
+ preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */
+ preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */
+ shll.ph t1, t0, 1
+ sll t3, t6, 1
+ addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */
+ addu t3, t3, t6 /* t3 = this * 3 */
+ addu.ph t0, t0, t2 /* t0 = next2|next1 */
+ addu t1, t3, t7
+ andi t7, t0, 0xFFFF /* t7 = next1 */
+ sll t2, t7, 1
+ addu t2, t7, t2 /* t2 = next1*3 */
+ addu t4, t2, t6
+ srl t6, t0, 16 /* t6 = next2 */
+ shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */
+ addu t0, t3, t7
+ addiu t0, 7
+ srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */
+ shra_r.w t4, t4, 4 /* t4 = (next1*3 + this + 8) >> 4 */
+ addu t2, t2, t6
+ addiu t2, 7
+ srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ sb t4, 2(s3)
+ sb t2, 3(s3)
+ bne t8, s0, 2b
+ addiu s3, 4
+22:
+ beqz s5, 4f /* no residual column */
+ addu t8, s0, s5
+3:
+ /* Residual loop: one input column -> two output bytes per iteration. */
+ lbu t0, 0(s0)
+ lbu t2, 0(s1)
+ addiu s0, 1
+ addiu s1, 1
+ sll t3, t6, 1
+ sll t1, t0, 1
+ addu t1, t0, t1 /* t1 = inptr0 * 3 */
+ addu t3, t3, t6 /* t3 = thiscolsum * 3 */
+ addu t5, t1, t2
+ addu t1, t3, t7
+ shra_r.w t1, t1, 4
+ addu t0, t3, t5
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu s3, 2
+ move t7, t6
+ bne t8, s0, 3b
+ move t6, t5
+4:
+ /* Last column of the row is handled separately. */
+ sll t0, t6, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t6 /* t1 = thiscolsum * 3 */
+ addu t1, t1, t7
+ addiu s4, 4 /* next entry in the output row-pointer array */
+ shra_r.w t1, t1, 4
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu t9, -1
+ addiu s3, 2
+ bnez t9, 1b
+ lw s1, 4(a2) /* (delay slot) inptr1 for the second output row */
+ srl t0, s4, 2 /* t0 = number of output rows written so far */
+ subu t0, a0, t0
+ bgtz t0, 0b
+ addiu a2, 4 /* (delay slot) ++input_data */
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * Fancy 2:1 horizontal upsampling.  Each input sample yields two output
+ * samples made of 3/4 of the sample itself plus 1/4 of its left or right
+ * neighbour:
+ *   left  = (3 * cur + prev + 1) >> 2
+ *   right = (3 * cur + next + 2) >> 2
+ * The first and the last output sample of every row are plain copies of
+ * the first and the last input sample.
+ *
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ *
+ * Row layout: first column, then (a1 - 2) / 4 passes of the packed
+ * 4-sample loop at 1:, then (a1 - 2) % 4 single samples at 2:, then the
+ * last column at 22:.
+ * NOTE(review): a1 < 2 is not guarded against (a1 - 2 is shifted as an
+ * unsigned value); presumably callers never pass such a width - confirm.
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ .set at
+
+ beqz a0, 3f /* no rows: nothing to do */
+ sll t0, a0, 2 /* t0 = size in bytes of a0 row pointers */
+ lw s1, 0(a3) /* s1 = output_data (array of row pointers) */
+ li s3, 0x10001 /* s3 = +1 rounding term in both halfwords */
+ addu s0, s1, t0 /* s0 = end of the output row pointer array */
+0:
+ addiu t8, a1, -2 /* t8 = samples between first and last column */
+ srl t9, t8, 2 /* t9 = number of packed 4-sample passes */
+ lw t7, 0(a2) /* t7 = inptr */
+ lw s2, 0(s1) /* s2 = outptr */
+ lbu t0, 0(t7) /* t0 = inptr[0] */
+ lbu t1, 1(t7) /* t1 = inptr[1] */
+ sll t2, t0, 1
+ addu t2, t2, t0 /* t2 = invalue*3 */
+ addu t2, t2, t1
+ shra_r.w t2, t2, 2 /* t2 = (invalue*3 + inptr[1] + 2) >> 2 */
+ sb t0, 0(s2) /* first output is a copy of the first input */
+ sb t2, 1(s2)
+ beqz t9, 11f /* fewer than 4 middle samples: skip packed loop */
+ addiu s2, 2
+1:
+ ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 1(t7) /* t1 = |P4|P3|P2|P1| */
+ ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */
+ preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */
+ preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */
+ preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */
+ preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */
+ preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */
+ shll.ph t5, t4, 1
+ shll.ph t6, t1, 1
+ addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */
+ addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */
+ addu.ph t4, t3, s3 /* t4 = |P3+1|P2+1| (left neighbours + rounding) */
+ addu.ph t0, t0, s3 /* t0 = |P1+1|P0+1| */
+ addu.ph t4, t4, t5
+ addu.ph t0, t0, t6
+ shrl.ph t4, t4, 2 /* t4 = left outputs of |P4|P3| */
+ shrl.ph t0, t0, 2 /* t0 = left outputs of |P2|P1| */
+ addu.ph t2, t2, t5 /* t2 = |P4*3+P5|P3*3+P4| */
+ addu.ph t3, t3, t6 /* t3 = |P2*3+P3|P1*3+P2| */
+ shra_r.ph t2, t2, 2 /* t2 = right outputs of |P4|P3| (rounded) */
+ shra_r.ph t3, t3, 2 /* t3 = right outputs of |P2|P1| (rounded) */
+ shll.ph t2, t2, 8 /* move right outputs to the high byte of each halfword */
+ shll.ph t3, t3, 8
+ or t2, t4, t2 /* t2 = |R4|L4|R3|L3| */
+ or t3, t3, t0 /* t3 = |R2|L2|R1|L1| */
+ addiu t9, -1
+ usw t3, 0(s2) /* outputs of P1, P2 */
+ usw t2, 4(s2) /* outputs of P3, P4 */
+ addiu s2, 8
+ bgtz t9, 1b
+ addiu t7, 4
+11:
+ andi t8, 3 /* t8 = middle samples left for the scalar loop */
+ beqz t8, 22f
+ addiu t7, 1 /* always executed: t7 = next sample to expand */
+
+2:
+ lbu t0, 0(t7) /* t0 = invalue */
+ addiu t7, 1
+ sll t1, t0, 1
+ addu t2, t0, t1 /* t2 = invalue*3 */
+ lbu t3, -2(t7) /* t3 = previous sample */
+ lbu t4, 0(t7) /* t4 = next sample */
+ addiu t3, 1
+ addiu t4, 2
+ addu t3, t3, t2
+ addu t4, t4, t2
+ srl t3, 2 /* t3 = (invalue*3 + prev + 1) >> 2 */
+ srl t4, 2 /* t4 = (invalue*3 + next + 2) >> 2 */
+ sb t3, 0(s2)
+ sb t4, 1(s2)
+ addiu t8, -1
+ bgtz t8, 2b
+ addiu s2, 2
+
+22:
+ lbu t0, 0(t7) /* t0 = last input sample */
+ lbu t2, -1(t7) /* t2 = sample before the last one */
+ sll t1, t0, 1
+ addu t1, t1, t0 /* t1 = invalue * 3 */
+ addu t1, t1, t2
+ addiu t1, 1
+ srl t1, t1, 2 /* t1 = (invalue*3 + prev + 1) >> 2 */
+ sb t1, 0(s2)
+ sb t0, 1(s2) /* last output is a copy of the last input */
+ addiu s1, 4 /* next output row pointer */
+ bne s1, s0, 0b
+ addiu a2, 4 /* next input row pointer */
+3:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * 2:1 horizontal downsampling.  For each of the v_samp_factor rows:
+ *   out[i] = (in[2*i] + in[2*i+1] + bias) >> 1, bias alternating 0, 1, ...
+ * for the image_width / 2 outputs backed by real input samples; the rest
+ * of the row, up to width_in_blocks * 8 outputs, is filled with the last
+ * input sample of the row.
+ *
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ *
+ * s2 holds the per-row padding count and must stay intact across rows, so
+ * the padding loop counts down in a scratch copy.  Rows with fewer than
+ * four real outputs bypass the 4-output unrolled loop entirely.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+ beqz a2, 7f
+ lw s1, 44(sp) /* s1 = output_data */
+ lw s0, 40(sp) /* s0 = input_data */
+ srl s2, a0, 2
+ andi t9, a0, 2 /* t9 != 0: one odd padding byte per row */
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2 /* s2 = padding halfwords per row */
+0:
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1 /* t6 = 0 (odd width) or -1 (even width) */
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ li s3, 0 /* s3 = bias */
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2 /* s4 = passes of the 4-output loop */
+ andi t8, t7, 3 /* t8 = outputs left for the scalar loop */
+ blez s4, 11f /* fewer than 4 real outputs: skip unrolled loop */
+ nop
+1:
+ ulhu t0, 0(t5)
+ ulhu t1, 2(t5)
+ ulhu t2, 4(t5)
+ ulhu t3, 6(t5)
+ raddu.w.qb t0, t0 /* sum of each input pair */
+ raddu.w.qb t1, t1
+ raddu.w.qb t2, t2
+ raddu.w.qb t3, t3
+ shra.ph t0, t0, 1 /* bias 0 */
+ shra_r.ph t1, t1, 1 /* bias 1 */
+ shra.ph t2, t2, 1 /* bias 0 */
+ shra_r.ph t3, t3, 1 /* bias 1 */
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t3, 3(t4)
+ addiu s4, -1
+ addiu t4, 4
+ bgtz s4, 1b
+ addiu t5, 8
+11:
+ beqz t8, 3f
+ addu s4, t4, t8 /* s4 = end address of the scalar outputs */
+2:
+ ulhu t0, 0(t5)
+ raddu.w.qb t0, t0
+ addqh.w t0, t0, s3 /* (pair sum + bias) >> 1 */
+ xori s3, s3, 1
+ sb t0, 0(t4)
+ addiu t4, 1
+ bne t4, s4, 2b
+ addiu t5, 2
+3:
+ lbux t1, t6(t5) /* t1 = last input sample of the row */
+ sll t1, 1
+ addqh.w t2, t1, s3 /* t2 = pixval1 */
+ xori s3, s3, 1
+ addqh.w t3, t1, s3 /* t3 = pixval2 */
+ blez s2, 5f
+ append t3, t2, 8 /* t3 = pixval2:pixval1 as one halfword */
+ move t5, s2 /* t5 = scratch copy of the padding count */
+4:
+ ush t3, 0(t4)
+ addiu t5, -1
+ bgtz t5, 4b
+ addiu t4, 2
+5:
+ beqz t9, 6f
+ nop
+ sb t2, 0(t4) /* odd trailing padding byte */
+6:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 4
+7:
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+ j ra
+ nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * 2:1 horizontal and 2:1 vertical downsampling.  For each of the
+ * v_samp_factor output rows, two input rows are consumed:
+ *   out[i] = (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
+ * with bias alternating 1, 2, ... for the image_width / 2 outputs backed
+ * by real input samples; the rest of the row, up to width_in_blocks * 8
+ * outputs, is derived from the last sample of both input rows.
+ *
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ *
+ * s4, t8 and s2 are row-invariant counts computed once; every row works
+ * on scratch copies (s3, t3, s5) so the counts are still valid for the
+ * following rows.  Rows with fewer than four real outputs bypass the
+ * 4-output unrolled loop entirely.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ beqz a2, 8f
+ lw s1, 52(sp) /* s1 = output_data */
+ lw s0, 48(sp) /* s0 = input_data */
+
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1 /* t6 = 0 (odd width) or -1 (even width) */
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2 /* s4 = passes of the 4-output loop per row */
+ andi t8, t7, 3 /* t8 = scalar outputs per row */
+ andi t9, a0, 2 /* t9 != 0: one odd padding byte per row */
+ srl s2, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2 /* s2 = padding halfwords per row */
+0:
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ lw s7, 4(s0) /* s7 = inptr1 */
+ li s6, 1 /* s6 = bias */
+ move s5, s2 /* s5 = padding halfwords left in this row */
+ blez s4, 21f /* fewer than 4 real outputs: skip unrolled loop */
+ move s3, s4 /* s3 = unrolled passes left in this row */
+2:
+ ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */
+ ulw t2, 4(t5)
+ ulw t3, 4(s7)
+ precrq.ph.w t7, t0, t1 /* t7 = |P3|P2|Q3|Q2| */
+ ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */
+ raddu.w.qb t1, t7
+ raddu.w.qb t0, t0
+ shra_r.w t1, t1, 2 /* bias 2 */
+ addiu t0, 1 /* bias 1 */
+ srl t0, 2
+ precrq.ph.w t7, t2, t3
+ ins t2, t3, 16, 16
+ raddu.w.qb t7, t7
+ raddu.w.qb t2, t2
+ shra_r.w t7, t7, 2 /* bias 2 */
+ addiu t2, 1 /* bias 1 */
+ srl t2, 2
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t7, 3(t4)
+ addiu t4, 4
+ addiu t5, 8
+ addiu s3, s3, -1
+ bgtz s3, 2b
+ addiu s7, 8
+21:
+ beqz t8, 4f
+ addu t3, t4, t8 /* t3 = end address of the scalar outputs */
+3:
+ ulhu t0, 0(t5)
+ ulhu t1, 0(s7)
+ ins t0, t1, 16, 16
+ raddu.w.qb t0, t0
+ addu t0, t0, s6
+ srl t0, 2
+ xori s6, s6, 3 /* bias toggles between 1 and 2 */
+ sb t0, 0(t4)
+ addiu t5, 2
+ addiu t4, 1
+ bne t3, t4, 3b
+ addiu s7, 2
+4:
+ lbux t1, t6(t5) /* last sample of input row 0 */
+ sll t1, 1
+ lbux t0, t6(s7) /* last sample of input row 1 */
+ sll t0, 1
+ addu t1, t1, t0
+ addu t3, t1, s6
+ srl t0, t3, 2 /* t0 = pixval1 */
+ xori s6, s6, 3
+ addu t2, t1, s6
+ srl t1, t2, 2 /* t1 = pixval2 */
+ blez s5, 6f
+ append t1, t0, 8 /* t1 = pixval2:pixval1 as one halfword */
+5:
+ ush t1, 0(t4)
+ addiu s5, -1
+ bgtz s5, 5b
+ addiu t4, 2
+6:
+ beqz t9, 7f
+ nop
+ sb t0, 0(t4) /* odd trailing padding byte */
+7:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 8
+8:
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * 2:1 horizontal and vertical downsampling with smoothing.  Each output
+ * sample is
+ *   (membersum * (16384 - 80 * SF) + neighsum * (16 * SF) + 32768) >> 16
+ * where membersum is the sum of the 2x2 input samples covered by the
+ * output sample, neighsum is twice the sum of the eight edge-adjacent
+ * neighbours plus the sum of the four corner neighbours, and SF is the
+ * smoothing factor.
+ *
+ * a0 = input_data
+ * a1 = output_data
+ * a2 = compptr->v_samp_factor
+ * a3 = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ *
+ * Before filtering, rows input_data[-1] .. input_data[max_v_samp_factor]
+ * are extended on the right to output_cols * 2 samples by replicating
+ * their last sample.  Each output row consumes two input rows, so the
+ * input row index advances by 2 per output row.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s7, 52(sp) /* compptr->width_in_blocks */
+ lw s0, 56(sp) /* cinfo->image_width */
+ lw s6, 48(sp) /* cinfo->smoothing_factor */
+ sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */
+ sll v0, s7, 1
+ subu v0, v0, s0 /* v0 = output_cols * 2 - image_width */
+ blez v0, 2f /* rows already wide enough: no edge expansion */
+ move v1, zero /* v1 = row counter for the edge expansion */
+ addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+ addiu t1, a0, -4 /* start one row above input_data[0] */
+ sll t2, v1, 2
+ lwx t1, t2(t1) /* t1 = input_data[v1 - 1] */
+ move t3, v0 /* t3 = samples to append */
+ addu t1, t1, s0 /* t1 = first sample past the image width */
+ lbu t2, -1(t1) /* t2 = last real sample of the row */
+1:
+ addiu t3, t3, -1
+ sb t2, 0(t1)
+ bgtz t3, 1b
+ addiu t1, t1, 1
+ addiu v1, v1, 1
+ bne v1, t0, 0b
+ nop
+2:
+ li v0, 80
+ mul v0, s6, v0
+ li v1, 16384
+ move t4, zero /* t4 = outrow */
+ move t5, zero /* t5 = inrow */
+ subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */
+ sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+ sll v0, t4, 2
+ lwx t8, v0(a1) /* outptr = output_data[outrow] */
+ sll v1, t5, 2
+ addiu t9, v1, 4
+ addiu s0, v1, -4
+ addiu s1, v1, 8
+ lwx s2, v1(a0) /* inptr0 = input_data[inrow] */
+ lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */
+ lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */
+ lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0 /* t2 = membersum */
+ raddu.w.qb s3, t0 /* s3 = above pair + below pair */
+ lbu v0, 0(s2)
+ lbu v1, 2(s2)
+ lbu t0, 0(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6 /* ac1 = membersum * memberscale */
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, 0(s0)
+ lbu t0, 0(s1)
+ sll s3, s3, 1 /* edge neighbours count twice */
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3 /* s3 = neighsum */
+ madd $ac1, s3, t7 /* ac1 += neighsum * neighscale */
+ extr_r.w v0, $ac1, 16 /* rounded >> 16 */
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ addiu s1, s1, 2
+ sb v0, -1(t8)
+ addiu s4, s7, -2
+ and s4, s4, 3 /* s4 = (output_cols - 2) % 4 single-sample passes */
+ addu s5, s4, t8 /* end address */
+4:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ sb t2, -1(t8)
+ bne s5, t8, 4b
+ addiu s1, s1, 2
+ addiu s5, s7, -2
+ subu s5, s5, s4
+ addu s5, s5, t8 /* end address */
+5:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ lh v1, 2(t9)
+ addu t0, t0, v0
+ lh v0, 2(s2)
+ addu s3, t0, s3
+ lh t0, 2(s0)
+ lh t1, 2(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 4(s2)
+ lbu t0, 1(t9)
+ lbu t1, 4(t9)
+ sb t2, 0(t8) /* first of the four outputs of this pass */
+ raddu.w.qb t3, v0
+ lbu v0, 1(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 4(s0)
+ addu t0, t0, v0
+ lbu v0, 1(s0)
+ addu s3, t0, s3
+ lbu t0, 1(s1)
+ lbu t3, 4(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 4(t9)
+ addu t0, t0, v0
+ lh v0, 4(s2)
+ addu s3, t0, s3
+ lh t0, 4(s0)
+ lh t1, 4(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 6(s2)
+ lbu t0, 3(t9)
+ lbu t1, 6(t9)
+ sb t2, 1(t8) /* second output */
+ raddu.w.qb t3, v0
+ lbu v0, 3(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 6(s0)
+ addu t0, t0, v0
+ lbu v0, 3(s0)
+ addu s3, t0, s3
+ lbu t0, 3(s1)
+ lbu t3, 6(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 6(t9)
+ addu t0, t0, v0
+ lh v0, 6(s2)
+ addu s3, t0, s3
+ lh t0, 6(s0)
+ lh t1, 6(s1)
+ madd $ac1, s3, t7
+ extr_r.w t3, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 8(s2)
+ lbu t0, 5(t9)
+ lbu t1, 8(t9)
+ sb t3, 2(t8) /* third output */
+ raddu.w.qb t2, v0
+ lbu v0, 5(s2)
+ addu t0, t0, t1
+ mult $ac1, t2, t6
+ addu v0, v0, v1
+ lbu t2, 8(s0)
+ addu t0, t0, v0
+ lbu v0, 5(s0)
+ addu s3, t0, s3
+ lbu t0, 5(s1)
+ lbu t3, 8(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ addiu t8, t8, 4
+ addu t0, t0, v0
+ addiu s2, s2, 8
+ addu s3, t0, s3
+ addiu t9, t9, 8
+ madd $ac1, s3, t7
+ extr_r.w t1, $ac1, 16
+ addiu s0, s0, 8
+ addiu s1, s1, 8
+ bne s5, t8, 5b
+ sb t1, -1(t8) /* fourth output (t8 already advanced by 4) */
+/* Special case for last column */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 1(s2)
+ lbu t0, -1(t9)
+ lbu t1, 1(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 1(s0)
+ addu t0, t0, v0
+ lbu t3, 1(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t0, $ac1, 16
+ sb t0, 0(t8)
+ addiu t4, t4, 1 /* outrow++ */
+ bne t4, a2, 3b
+ addiu t5, t5, 2 /* inrow += 2: two input rows per output row */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * Upsampling by integral factors: every input sample is replicated
+ * h_expand times horizontally, and every expanded row is then copied to
+ * the following v_expand - 1 output rows.
+ *
+ * a0 = upsample->h_expand[compptr->component_index]
+ * a1 = upsample->v_expand[compptr->component_index]
+ * a2 = input_data
+ * a3 = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ *
+ * Row bookkeeping: t6 is the byte offset of the current entry in the
+ * input row pointer array (4 bytes per row pointer), s3 is the output
+ * row index and v0 points at the output row pointer currently being
+ * filled or replicated.  v0 is used as scratch.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ lw s0, 0(a3) /* s0 = output_data */
+ lw s1, 32(sp) /* s1 = cinfo->output_width */
+ lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */
+ li t6, 0 /* t6 = inrow * 4 (byte offset) */
+ beqz s2, 10f
+ li s3, 0 /* s3 = outrow */
+0:
+ sll v0, s3, 2 /* v0 = outrow * 4 */
+ addu t0, a2, t6 /* t0 = &input_data[inrow] */
+ addu v0, s0, v0 /* v0 = &output_data[outrow] */
+ lw t3, 0(t0) /* t3 = inptr */
+ lw t8, 0(v0) /* t8 = outptr */
+ beqz s1, 4f
+ addu t5, t8, s1 /* t5 = outend */
+1:
+ lb t2, 0(t3) /* t2 = invalue = *inptr++ */
+ addiu t3, 1
+ beqz a0, 3f
+ move t0, a0 /* t0 = h_expand */
+2:
+ sb t2, 0(t8)
+ addiu t0, -1
+ bgtz t0, 2b
+ addiu t8, 1
+3:
+ bgt t5, t8, 1b
+ nop
+4:
+ addiu t9, a1, -1 /* t9 = v_expand - 1 */
+ blez t9, 9f
+ nop
+5:
+ lw t3, 0(v0) /* t3 = source row (already expanded) */
+ lw t4, 4(v0) /* t4 = destination row (next output row) */
+ subu t0, s1, 0xF
+ blez t0, 7f /* rows shorter than 16 samples: byte copy only */
+ addu t5, t3, s1 /* t5 = end address */
+ andi t7, s1, 0xF /* t7 = residual */
+ subu t8, t5, t7 /* t8 = end of the 16-byte chunks */
+6:
+ ulw t0, 0(t3)
+ ulw t1, 4(t3)
+ ulw t2, 8(t3)
+ usw t0, 0(t4)
+ ulw t0, 12(t3)
+ usw t1, 4(t4)
+ usw t2, 8(t4)
+ usw t0, 12(t4)
+ addiu t3, 16
+ bne t3, t8, 6b
+ addiu t4, 16
+ beqz t7, 8f
+ nop
+7:
+ lbu t0, 0(t3)
+ sb t0, 0(t4)
+ addiu t3, 1
+ bne t3, t5, 7b
+ addiu t4, 1
+8:
+ addiu t9, -1
+ bgtz t9, 5b
+ addiu v0, 4 /* the row just written is the next source */
+9:
+ addu s3, s3, a1 /* outrow += v_expand */
+ bne s3, s2, 0b
+ addiu t6, 4 /* inrow++ (one row pointer) */
+10:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * Plain 2:1 horizontal upsampling: every input sample is written twice.
+ * Each row is produced 16 output bytes (8 input samples) at a time; the
+ * last output_width % 16 bytes are produced two at a time.
+ *
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ *
+ * The scalar tail always stores sample pairs, so an odd residual writes
+ * one byte past output_width.
+ */
+ lw t7, 0(a3) /* t7 = output_data */
+ andi t8, a1, 0xf /* t8 = residual */
+ sll t0, a0, 2
+ blez a0, 4f
+ addu t9, t7, t0 /* t9 = output_data end address */
+0:
+ lw t5, 0(t7) /* t5 = outptr */
+ lw t6, 0(a2) /* t6 = inptr */
+ addu t3, t5, a1 /* t3 = outptr + output_width (end address) */
+ subu t3, t8 /* t3 = end address - residual */
+ beq t5, t3, 2f /* fewer than 16 outputs: scalar tail only */
+ move t4, t8 /* t4 = output bytes left for the scalar tail */
+1:
+ ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
+ ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
+ srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
+ ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
+ ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
+ ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
+ ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t0, t2, 16 /* t0 = |X|X|P7|P6| */
+ ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
+ ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */
+ ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
+ ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t5, 16
+ bne t5, t3, 1b
+ addiu t6, 8
+ beqz t8, 3f /* no residual: row done */
+ move t4, t8
+2:
+ lbu t1, 0(t6)
+ sb t1, 0(t5)
+ sb t1, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ addiu t7, 4 /* next output row pointer */
+ bne t9, t7, 0b
+ addiu a2, 4 /* next input row pointer */
+4:
+ j ra
+ nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * Plain 2:1 horizontal and 2:1 vertical upsampling: every input row is
+ * expanded by writing each sample twice, and the expanded row is then
+ * copied to the following output row.  Two output rows are produced per
+ * input row.
+ *
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ *
+ * The scalar tail of the expansion always stores sample pairs, so an odd
+ * residual writes one byte past output_width.
+ */
+ lw t7, 0(a3) /* t7 = output_data */
+ blez a0, 7f
+ andi t9, a1, 0xf /* t9 = residual */
+0:
+ lw t6, 0(a2) /* t6 = inptr */
+ lw t5, 0(t7) /* t5 = outptr */
+ addu t8, t5, a1 /* t8 = outptr end address */
+ subu t8, t9 /* t8 = end address - residual */
+ beq t5, t8, 2f /* fewer than 16 outputs: scalar tail only */
+ move t4, t9 /* t4 = output bytes left for the scalar tail */
+1:
+ ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
+ srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
+ ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
+ ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
+ ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
+ ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
+ ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t3, t2, 16 /* t3 = |X|X|P7|P6| */
+ ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
+ ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
+ ins t3, t3, 16, 16 /* t3 = |P7|P6|P7|P6| */
+ ins t3, t3, 8, 16 /* t3 = |P7|P7|P6|P6| */
+ usw t2, 8(t5)
+ usw t3, 12(t5)
+ addiu t5, 16
+ bne t5, t8, 1b
+ addiu t6, 8
+ beqz t9, 3f /* no residual: expansion done */
+ move t4, t9
+2:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ sb t0, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ lw t6, 0(t7) /* t6 = outptr[0] */
+ lw t5, 4(t7) /* t5 = outptr[1] */
+ addu t4, t6, a1 /* t4 = new end address */
+ beq a1, t9, 5f /* width < 16: byte copy only */
+ subu t8, t4, t9 /* t8 = end of the 16-byte chunks */
+4:
+ ulw t0, 0(t6)
+ ulw t1, 4(t6)
+ ulw t2, 8(t6)
+ usw t0, 0(t5)
+ ulw t0, 12(t6)
+ usw t1, 4(t5)
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t6, 16
+ bne t6, t8, 4b
+ addiu t5, 16
+ beqz t9, 6f
+ nop
+5:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ addiu t6, 1
+ bne t6, t4, 5b
+ addiu t5, 1
+6:
+ addiu t7, 8 /* skip the two output rows just written */
+ addiu a0, -2
+ bgtz a0, 0b
+ addiu a2, 4 /* next input row pointer */
+7:
+ j ra
+ nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * Slow-but-accurate integer inverse DCT of one 8x8 block.
+ *
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ *
+ * Pass 1 (label 1:) walks the eight columns of coef_block, dequantizes
+ * them and stores 32-bit intermediate results in a 256-byte workspace
+ * reserved on the stack.  Pass 2 (label 4:) walks the eight workspace
+ * rows and stores eight bytes per row through the pointer loaded from the
+ * a2 array, translating every result through the a3 table with a 10-bit
+ * index.  Both passes take a shortcut when all AC terms of the column or
+ * row are zero.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -256 /* reserve the 8x8 workspace of 32-bit values */
+ move v0, sp /* v0 = wsptr */
+ addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */
+1:
+ lh s4, 32(a0) /* s4 = inptr[16] */
+ lh s5, 64(a0) /* s5 = inptr[32] */
+ lh s6, 96(a0) /* s6 = inptr[48] */
+ lh t1, 112(a0) /* t1 = inptr[56] */
+ lh t7, 16(a0) /* t7 = inptr[8] */
+ lh t5, 80(a0) /* t5 = inptr[40] */
+ lh t3, 48(a0) /* t3 = inptr[24] */
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, t5
+ or s4, s4, t7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 2f /* any AC term non-zero: full column transform */
+ addiu v1, v1, -1
+ lh s5, 0(a1) /* quantptr[DCTSIZE*0] */
+ lh s6, 0(a0) /* inptr[DCTSIZE*0] */
+ mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */
+ sll s5, s5, 2 /* dcval = DC term << 2 */
+ sw s5, 0(v0)
+ sw s5, 32(v0)
+ sw s5, 64(v0)
+ sw s5, 96(v0)
+ sw s5, 128(v0)
+ sw s5, 160(v0)
+ sw s5, 192(v0)
+ b 3f
+ sw s5, 224(v0)
+2:
+ lh t0, 112(a1)
+ lh t2, 48(a1)
+ lh t4, 80(a1)
+ lh t6, 16(a1)
+ mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7],
+ quantptr[DCTSIZE*7]) */
+ mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3],
+ quantptr[DCTSIZE*3]) */
+ mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5],
+ quantptr[DCTSIZE*5]) */
+ mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1],
+ quantptr[DCTSIZE*1]) */
+ lh t4, 32(a1)
+ lh t5, 32(a0)
+ lh t6, 96(a1)
+ lh t7, 96(a0)
+ addu s0, t0, t1 /* z3 = tmp0 + tmp2 */
+ addu s1, t1, t2 /* z2 = tmp1 + tmp2 */
+ addu s2, t2, t3 /* z4 = tmp1 + tmp3 */
+ addu s3, s0, s2 /* z3 + z4 */
+ addiu t9, zero, 9633 /* FIX_1_175875602 */
+ mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu t8, t0, t3 /* z1 = tmp0 + tmp3 */
+ addiu t9, zero, 2446 /* FIX_0_298631336 */
+ mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t9, zero, 16819 /* FIX_2_053119869 */
+ mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t9, zero, 25172 /* FIX_3_072711026 */
+ mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t9, zero, 12299 /* FIX_1_501321110 */
+ mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ addiu t9, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t9, zero, 3196 /* FIX_0_390180644 */
+ mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t9, zero, 7373 /* FIX_0_899976223 */
+ mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t9, zero, 20995 /* FIX_2_562915447 */
+ mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ subu s0, s3, s0 /* z3 += z5 */
+ addu t0, t0, s0 /* tmp0 += z3 */
+ addu t1, t1, s0 /* tmp2 += z3 */
+ subu s2, s3, s2 /* z4 += z5 */
+ addu t2, t2, s2 /* tmp1 += z4 */
+ addu t3, t3, s2 /* tmp3 += z4 */
+ subu t0, t0, t8 /* tmp0 += z1 */
+ subu t1, t1, s1 /* tmp2 += z2 */
+ subu t2, t2, s1 /* tmp1 += z2 */
+ subu t3, t3, t8 /* tmp3 += z1 */
+ mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ addiu t9, zero, 6270 /* FIX_0_765366865 */
+ mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ lh t4, 0(a1)
+ lh t5, 0(a0)
+ lh t6, 64(a1)
+ lh t7, 64(a0)
+ mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */
+ mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ addiu t9, zero, 4433 /* FIX_0_541196100 */
+ addu s3, s0, s1 /* z2 + z3 */
+ mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t9, zero, 15137 /* FIX_1_847759065 */
+ mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */
+ addu t4, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */
+ sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */
+ addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t6, s3, t8 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ addu s0, t4, t7 /* tmp10 = tmp0 + tmp3 */
+ subu s1, t4, t7 /* tmp13 = tmp0 - tmp3 */
+ addu s2, t5, t6 /* tmp11 = tmp1 + tmp2 */
+ subu s3, t5, t6 /* tmp12 = tmp1 - tmp2 */
+ addu t4, s0, t3 /* column output 0 */
+ subu s0, s0, t3 /* column output 7 */
+ addu t3, s2, t1 /* column output 1 */
+ subu s2, s2, t1 /* column output 6 */
+ addu t1, s3, t2 /* column output 2 */
+ subu s3, s3, t2 /* column output 5 */
+ addu t2, s1, t0 /* column output 3 */
+ subu s1, s1, t0 /* column output 4 */
+ shra_r.w t4, t4, 11 /* rounded descale by 11 bits */
+ shra_r.w t3, t3, 11
+ shra_r.w t1, t1, 11
+ shra_r.w t2, t2, 11
+ shra_r.w s1, s1, 11
+ shra_r.w s3, s3, 11
+ shra_r.w s2, s2, 11
+ shra_r.w s0, s0, 11
+ sw t4, 0(v0)
+ sw t3, 32(v0)
+ sw t1, 64(v0)
+ sw t2, 96(v0)
+ sw s1, 128(v0)
+ sw s3, 160(v0)
+ sw s2, 192(v0)
+ sw s0, 224(v0)
+3:
+ addiu a1, a1, 2 /* next quantization table column */
+ addiu a0, a0, 2 /* next coefficient column */
+ bgtz v1, 1b
+ addiu v0, v0, 4 /* next workspace column */
+ move v0, sp /* v0 = wsptr, back at the first workspace row */
+ addiu v1, zero, 8
+4:
+ lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */
+ lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */
+ lw t2, 0(v0) /* (JLONG)wsptr[0] */
+ lw t3, 16(v0) /* (JLONG)wsptr[4] */
+ lw s4, 4(v0) /* (JLONG)wsptr[1] */
+ lw s5, 12(v0) /* (JLONG)wsptr[3] */
+ lw s6, 20(v0) /* (JLONG)wsptr[5] */
+ lw s7, 28(v0) /* (JLONG)wsptr[7] */
+ or s4, s4, t0
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, s7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 5f /* any AC term non-zero: full row transform */
+ addiu v1, v1, -1
+ shra_r.w s5, t2, 5 /* rounded descale of the DC term by 5 bits */
+ andi s5, s5, 0x3ff /* 10-bit index into the a3 table */
+ lbux s5, s5(a3)
+ lw s1, 0(a2) /* s1 = output row pointer */
+ replv.qb s5, s5 /* replicate the byte into all four lanes */
+ usw s5, 0(s1)
+ usw s5, 4(s1)
+ b 6f
+ nop
+5:
+ addu t4, t0, t1 /* z2 + z3 */
+ addiu t8, zero, 4433 /* FIX_0_541196100 */
+ mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t8, zero, 15137 /* FIX_1_847759065 */
+ mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */
+ addiu t8, zero, 6270 /* FIX_0_765366865 */
+ mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */
+ addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+ subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+ sll t4, t4, 13 /* tmp0 =
+ (wsptr[0] + wsptr[4]) << CONST_BITS */
+ sll t2, t2, 13 /* tmp1 =
+ (wsptr[0] - wsptr[4]) << CONST_BITS */
+ subu t1, t5, t1 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */
+ addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */
+ addu t5, t5, t0 /* tmp3 =
+ z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */
+ addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */
+ lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */
+ lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */
+ lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */
+ lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */
+ addu s0, t4, t6 /* z3 = tmp0 + tmp2 */
+ addiu t8, zero, 9633 /* FIX_1_175875602 */
+ addu s1, t5, t7 /* z4 = tmp1 + tmp3 */
+ addu s2, s0, s1 /* z3 + z4 */
+ mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu s3, t4, t7 /* z1 = tmp0 + tmp3 */
+ addu t9, t5, t6 /* z2 = tmp1 + tmp2 */
+ addiu t8, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t8, zero, 3196 /* FIX_0_390180644 */
+ mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t8, zero, 2446 /* FIX_0_298631336 */
+ mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t8, zero, 7373 /* FIX_0_899976223 */
+ mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t8, zero, 16819 /* FIX_2_053119869 */
+ mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t8, zero, 20995 /* FIX_2_562915447 */
+ mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ addiu t8, zero, 25172 /* FIX_3_072711026 */
+ mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t8, zero, 12299 /* FIX_1_501321110 */
+ mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ subu s0, s2, s0 /* z3 += z5 */
+ subu s1, s2, s1 /* z4 += z5 */
+ addu t4, t4, s0
+ subu t4, t4, s3 /* tmp0 */
+ addu t5, t5, s1
+ subu t5, t5, t9 /* tmp1 */
+ addu t6, t6, s0
+ subu t6, t6, t9 /* tmp2 */
+ addu t7, t7, s1
+ subu t7, t7, s3 /* tmp3 */
+ addu s0, t0, t7 /* row output 0 = tmp10 + tmp3 */
+ subu t0, t0, t7 /* row output 7 = tmp10 - tmp3 */
+ addu t7, t2, t6 /* row output 1 = tmp11 + tmp2 */
+ subu t2, t2, t6 /* row output 6 = tmp11 - tmp2 */
+ addu t6, t3, t5 /* row output 2 = tmp12 + tmp1 */
+ subu t3, t3, t5 /* row output 5 = tmp12 - tmp1 */
+ addu t5, t1, t4 /* row output 3 = tmp13 + tmp0 */
+ subu t1, t1, t4 /* row output 4 = tmp13 - tmp0 */
+ shra_r.w s0, s0, 18 /* rounded descale by 18 bits */
+ shra_r.w t7, t7, 18
+ shra_r.w t6, t6, 18
+ shra_r.w t5, t5, 18
+ shra_r.w t1, t1, 18
+ shra_r.w t3, t3, 18
+ shra_r.w t2, t2, 18
+ shra_r.w t0, t0, 18
+ andi s0, s0, 0x3ff /* 10-bit indices into the a3 table */
+ andi t7, t7, 0x3ff
+ andi t6, t6, 0x3ff
+ andi t5, t5, 0x3ff
+ andi t1, t1, 0x3ff
+ andi t3, t3, 0x3ff
+ andi t2, t2, 0x3ff
+ andi t0, t0, 0x3ff
+ lw s1, 0(a2) /* s1 = output row pointer */
+ lbux s0, s0(a3)
+ lbux t7, t7(a3)
+ lbux t6, t6(a3)
+ lbux t5, t5(a3)
+ lbux t1, t1(a3)
+ lbux t3, t3(a3)
+ lbux t2, t2(a3)
+ lbux t0, t0(a3)
+ sb s0, 0(s1)
+ sb t7, 1(s1)
+ sb t6, 2(s1)
+ sb t5, 3(s1)
+ sb t1, 4(s1)
+ sb t3, 5(s1)
+ sb t2, 6(s1)
+ sb t0, 7(s1)
+6:
+ addiu v0, v0, 32 /* next workspace row */
+ bgtz v1, 4b
+ addiu a2, a2, 4 /* next output row pointer */
+ addiu sp, sp, 256 /* release the workspace */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * Column pass of the fast integer inverse DCT.  Every 32-bit load fetches
+ * two adjacent 16-bit values, so each pass of the loop at 0: transforms
+ * two columns at once with paired-halfword arithmetic; the loop runs until
+ * a0 has advanced by 16 bytes (four passes).  Results are stored as packed
+ * halfword pairs at a2, 16 bytes per row.
+ *
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ *
+ * The coefficient table pointer is kept in AT for the whole function.
+ * When rows 1..7 of both columns are zero, the dequantized DC pair is
+ * copied to all eight rows.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu t9, a0, 16 /* end address */
+ or AT, a3, zero /* AT = mips_idct_ifast_coefs */
+
+0:
+ lw s0, 0(a1) /* quantptr[DCTSIZE*0] */
+ lw t0, 0(a0) /* inptr[DCTSIZE*0] */
+ lw t1, 16(a0) /* inptr[DCTSIZE*1] */
+ muleq_s.w.phl v0, t0, s0 /* tmp0 ... */
+ lw t2, 32(a0) /* inptr[DCTSIZE*2] */
+ lw t3, 48(a0) /* inptr[DCTSIZE*3] */
+ lw t4, 64(a0) /* inptr[DCTSIZE*4] */
+ lw t5, 80(a0) /* inptr[DCTSIZE*5] */
+ muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */
+ lw t6, 96(a0) /* inptr[DCTSIZE*6] */
+ lw t7, 112(a0) /* inptr[DCTSIZE*7] */
+ or s4, t1, t2
+ or s5, t3, t4
+ bnez s4, 1f /* rows 1-2 non-zero: full transform */
+ ins t0, v0, 16, 16 /* ... tmp0 */
+ bnez s5, 1f /* rows 3-4 non-zero: full transform */
+ or s6, t5, t6
+ or s6, s6, t7
+ bnez s6, 1f /* rows 5-7 non-zero: full transform */
+ sw t0, 0(a2) /* wsptr[DCTSIZE*0] */
+ sw t0, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw t0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw t0, 48(a2) /* wsptr[DCTSIZE*3] */
+ sw t0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw t0, 80(a2) /* wsptr[DCTSIZE*5] */
+ sw t0, 96(a2) /* wsptr[DCTSIZE*6] */
+ sw t0, 112(a2) /* wsptr[DCTSIZE*7] */
+ addiu a0, a0, 4
+ b 2f
+ addiu a1, a1, 4
+
+1:
+ lw s1, 32(a1) /* quantptr[DCTSIZE*2] */
+ lw s2, 64(a1) /* quantptr[DCTSIZE*4] */
+ muleq_s.w.phl v0, t2, s1 /* tmp1 ... */
+ muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */
+ lw s0, 16(a1) /* quantptr[DCTSIZE*1] */
+ lw s1, 48(a1) /* quantptr[DCTSIZE*3] */
+ lw s3, 96(a1) /* quantptr[DCTSIZE*6] */
+ muleq_s.w.phl v1, t4, s2 /* tmp2 ... */
+ muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */
+ lw s2, 80(a1) /* quantptr[DCTSIZE*5] */
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ ins t2, v0, 16, 16 /* ... tmp1 */
+ muleq_s.w.phl v0, t6, s3 /* tmp3 ... */
+ muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */
+ ins t4, v1, 16, 16 /* ... tmp2 */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ ins t6, v0, 16, 16 /* ... tmp3 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ muleq_s.w.phl v0, t1, s0 /* tmp4 ... */
+ muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */
+ shll_s.ph s6, s6, 1 /* x2 */
+ lw s3, 112(a1) /* quantptr[DCTSIZE*7] */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ muleq_s.w.phl v1, t7, s3 /* tmp7 ... */
+ muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */
+ ins t1, v0, 16, 16 /* ... tmp4 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ muleq_s.w.phl v0, t5, s2 /* tmp6 ... */
+ muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */
+ ins t7, v1, 16, 16 /* ... tmp7 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ muleq_s.w.phl v1, t3, s1 /* tmp5 ... */
+ muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */
+ ins t5, v0, 16, 16 /* ... tmp6 */
+ ins t3, v1, 16, 16 /* ... tmp5 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7 /* tmp0 + tmp7 */
+ subq.ph s1, t0, t7 /* tmp0 - tmp7 */
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ shll_s.ph s5, s5, 1 /* x2 */
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ sw s0, 0(a2) /* wsptr[DCTSIZE*0] */
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 4 /* next pair of coefficient columns */
+ addiu a1, a1, 4 /* next pair of quantization columns */
+ sw s1, 112(a2) /* wsptr[DCTSIZE*7] */
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ addq.ph s2, t2, t5 /* tmp1 + tmp6 */
+ addq.ph t1, s4, t3 /* tmp4 */
+ subq.ph s3, t2, t5 /* tmp1 - tmp6 */
+ sw s2, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw s3, 96(a2) /* wsptr[DCTSIZE*6] */
+ addq.ph v0, t4, t3 /* tmp2 + tmp5 */
+ subq.ph v1, t4, t3 /* tmp2 - tmp5 */
+ sw v0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw v1, 80(a2) /* wsptr[DCTSIZE*5] */
+ addq.ph v0, t6, t1 /* tmp3 + tmp4 */
+ subq.ph v1, t6, t1 /* tmp3 - tmp4 */
+ sw v0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw v1, 48(a2) /* wsptr[DCTSIZE*3] */
+
+2:
+ bne a0, t9, 0b
+ addiu a2, a2, 4 /* next pair of workspace columns */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ *
+ * Row pass of the "ifast" inverse DCT.  Each loop iteration reads 32 bytes
+ * at a0 (two groups of eight halfwords, at byte offsets 0..15 and 16..31),
+ * transforms both groups at once as paired halfwords, and stores eight
+ * bytes through each of two pointers loaded from a1 (each offset by a2).
+ * The loop ends when a0 reaches its initial value + 128.
+ *
+ * When halfwords 1..7 of both groups are zero, a shortcut replicates the
+ * scaled first halfword of each group across that group's eight output
+ * bytes.
+ *
+ * s8 holds 0x80808080 and is added byte-wise to every packed result.
+ * AT holds the coefficient table pointer.  The shortcut path reuses a3 and
+ * AT as output pointers, so AT is reloaded from the stack slot where a3 was
+ * saved at the top of every iteration.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ addiu t9, a0, 128 /* end address */
+ lui s8, 0x8080
+ ori s8, s8, 0x8080 /* s8 = 0x80808080 (per-byte +128) */
+
+0:
+ lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */
+ lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */
+ lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */
+ lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */
+ lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */
+ lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */
+ lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */
+ lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */
+ lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */
+ precrq.ph.w t1, s0, t0 /* B b */
+ ins t0, s0, 16, 16 /* A a */
+ bnez t1, 1f /* any non-zero B/b -> full transform */
+ or s0, t2, s2 /* (delay slot) d c | D C */
+ bnez s0, 1f
+ or s0, t4, s4 /* (delay slot) f e | F E */
+ bnez s0, 1f
+ or s0, t6, s6 /* (delay slot) h g | H G */
+ bnez s0, 1f
+ shll_s.ph s0, t0, 2 /* A a */
+ lw a3, 0(a1) /* shortcut: first output pointer */
+ lw AT, 4(a1) /* shortcut: second output pointer */
+ precrq.ph.w t0, s0, s0 /* A A */
+ ins s0, s0, 16, 16 /* a a */
+ addu a3, a3, a2
+ addu AT, AT, a2
+ precrq.qb.ph t0, t0, t0 /* A A A A */
+ precrq.qb.ph s0, s0, s0 /* a a a a */
+ addu.qb s0, s0, s8
+ addu.qb t0, t0, s8
+ sw s0, 0(a3)
+ sw s0, 4(a3)
+ sw t0, 0(AT)
+ sw t0, 4(AT)
+ addiu a0, a0, 32 /* next two input groups */
+ bne a0, t9, 0b
+ addiu a1, a1, 8 /* (delay slot) next two output pointers */
+ b 2f
+ nop
+
+1:
+ precrq.ph.w t3, s2, t2
+ ins t2, s2, 16, 16
+ precrq.ph.w t5, s4, t4
+ ins t4, s4, 16, 16
+ precrq.ph.w t7, s6, t6
+ ins t6, s6, 16, 16
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ shll_s.ph s6, s6, 1 /* x2 */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7 /* tmp0 + tmp7 */
+ subq.ph s7, t0, t7 /* tmp0 - tmp7 */
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ lw a3, 0(a1) /* first output pointer */
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ shll_s.ph s5, s5, 1 /* x2 */
+ addu a3, a3, a2
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 32 /* next two input groups */
+ addiu a1, a1, 8 /* next two output pointers */
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ shll_s.ph s0, s0, 2
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ shll_s.ph s7, s7, 2
+ addq.ph t1, s4, t3 /* tmp4 */
+ addq.ph s1, t2, t5 /* tmp1 + tmp6 */
+ subq.ph s6, t2, t5 /* tmp1 - tmp6 */
+ addq.ph s2, t4, t3 /* tmp2 + tmp5 */
+ subq.ph s5, t4, t3 /* tmp2 - tmp5 */
+ addq.ph s4, t6, t1 /* tmp3 + tmp4 */
+ subq.ph s3, t6, t1 /* tmp3 - tmp4 */
+ shll_s.ph s1, s1, 2
+ shll_s.ph s2, s2, 2
+ shll_s.ph s3, s3, 2
+ shll_s.ph s4, s4, 2
+ shll_s.ph s5, s5, 2
+ shll_s.ph s6, s6, 2
+ precrq.ph.w t0, s1, s0 /* B A */
+ ins s0, s1, 16, 16 /* b a */
+ precrq.ph.w t2, s3, s2 /* D C */
+ ins s2, s3, 16, 16 /* d c */
+ precrq.ph.w t4, s5, s4 /* F E */
+ ins s4, s5, 16, 16 /* f e */
+ precrq.ph.w t6, s7, s6 /* H G */
+ ins s6, s7, 16, 16 /* h g */
+ precrq.qb.ph t0, t2, t0 /* D C B A */
+ precrq.qb.ph s0, s2, s0 /* d c b a */
+ precrq.qb.ph t4, t6, t4 /* H G F E */
+ precrq.qb.ph s4, s6, s4 /* h g f e */
+ addu.qb s0, s0, s8
+ addu.qb s4, s4, s8
+ sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */
+ sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */
+ lw a3, -4(a1) /* second output pointer (a1 already += 8) */
+ addu.qb t0, t0, s8
+ addu a3, a3, a2
+ addu.qb t4, t4, s8
+ sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */
+ bne a0, t9, 0b
+ sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */
+
+2:
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ *
+ * In-place forward DCT on the 128 bytes at a0, accessed as an 8x8 array of
+ * halfwords.
+ *
+ * Pass 1 (label 1) runs 8 times, once per 16-byte row (a1 walks the rows).
+ * It uses paired-halfword dot products against constants packed two per
+ * register in t0-t9 and extracts results with a rounding right shift by 11.
+ *
+ * Pass 2 (label 2) runs 8 times, once per column (a0 advances by 2 bytes,
+ * rows are 16 bytes apart).  It uses scalar multiply-accumulates against
+ * constants reloaded into t0-t9/a1 and extracts results with a rounding
+ * right shift by 15 (by 2 for the plain sum/difference outputs).
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ lui t0, 6437
+ ori t0, 2260 /* t0 = 6437 | 2260 (high | low halfword) */
+ lui t1, 9633
+ ori t1, 11363 /* t1 = 9633 | 11363 */
+ lui t2, 0xd39e
+ ori t2, 0xe6dc /* t2 = -11362 | -6436 */
+ lui t3, 0xf72d
+ ori t3, 9633 /* t3 = -2259 | 9633 */
+ lui t4, 2261
+ ori t4, 9633 /* t4 = 2261 | 9633 */
+ lui t5, 0xd39e
+ ori t5, 6437 /* t5 = -11362 | 6437 */
+ lui t6, 9633
+ ori t6, 0xd39d /* t6 = 9633 | -11363 */
+ lui t7, 0xe6dc
+ ori t7, 2260 /* t7 = -6436 | 2260 */
+ lui t8, 4433
+ ori t8, 10703 /* t8 = 4433 | 10703 */
+ lui t9, 0xd630
+ ori t9, 4433 /* t9 = -10704 | 4433 */
+ li s8, 8 /* pass 1 iteration count */
+ move a1, a0 /* a1 = row pointer for pass 1 */
+1:
+ lw s0, 0(a1) /* tmp0 = 1|0 */
+ lw s1, 4(a1) /* tmp1 = 3|2 */
+ lw s2, 8(a1) /* tmp2 = 5|4 */
+ lw s3, 12(a1) /* tmp3 = 7|6 */
+ packrl.ph s1, s1, s1 /* tmp1 = 2|3 */
+ packrl.ph s3, s3, s3 /* tmp3 = 6|7 */
+ subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */
+ dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */
+ dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */
+ dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */
+ dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */
+ addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */
+ extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */
+ addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sh s0, 2(a1) /* row element 1 */
+ sh s1, 6(a1) /* row element 3 */
+ sh s2, 10(a1) /* row element 5 */
+ sh s3, 14(a1) /* row element 7 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */
+ sra s4, s5, 16 /* tmp4 = t11 */
+ addiu a1, a1, 16 /* advance to next row (stores below use
+ negative offsets) */
+ addiu s8, s8, -1
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ addu s2, s5, s4 /* tmp2 = t10 + t11 (low halfword) */
+ subu s3, s5, s4 /* tmp3 = t10 - t11 (low halfword) */
+ sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */
+ sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */
+ sh s2, -16(a1) /* row element 0 */
+ sh s3, -8(a1) /* row element 4 */
+ sh s0, -12(a1) /* row element 2 */
+ bgtz s8, 1b
+ sh s1, -4(a1) /* (delay slot) row element 6 */
+ li t0, 2260 /* c0 */
+ li t1, 11363 /* c1 */
+ li t2, 9633 /* c2 */
+ li t3, 6436 /* c3 */
+ li t4, 6437 /* c4 */
+ li t5, 2261 /* c5 */
+ li t6, 11362 /* c6 */
+ li t7, 2259 /* c7 */
+ li t8, 4433 /* c8 */
+ li t9, 10703 /* c9 */
+ li a1, 10704 /* c10 */
+ li s8, 8 /* pass 2 iteration count */
+
+2:
+ lh a2, 0(a0) /* 0 */
+ lh a3, 16(a0) /* 8 */
+ lh v0, 32(a0) /* 16 */
+ lh v1, 48(a0) /* 24 */
+ lh s4, 64(a0) /* 32 */
+ lh s5, 80(a0) /* 40 */
+ lh s6, 96(a0) /* 48 */
+ lh s7, 112(a0) /* 56 */
+ addu s2, v0, s5 /* tmp2 = 16 + 40 */
+ subu s5, v0, s5 /* tmp5 = 16 - 40 */
+ addu s3, v1, s4 /* tmp3 = 24 + 32 */
+ subu s4, v1, s4 /* tmp4 = 24 - 32 */
+ addu s0, a2, s7 /* tmp0 = 0 + 56 */
+ subu s7, a2, s7 /* tmp7 = 0 - 56 */
+ addu s1, a3, s6 /* tmp1 = 8 + 48 */
+ subu s6, a3, s6 /* tmp6 = 8 - 48 */
+ addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */
+ subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */
+ addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */
+ subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */
+ mult s7, t1 /* ac0 = tmp7 * c1 */
+ madd s4, t0 /* ac0 += tmp4 * c0 */
+ madd s5, t4 /* ac0 += tmp5 * c4 */
+ madd s6, t2 /* ac0 += tmp6 * c2 */
+ mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */
+ msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */
+ msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */
+ msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */
+ mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */
+ madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */
+ madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */
+ msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */
+ mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */
+ msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */
+ madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */
+ msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */
+ extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */
+ extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */
+ extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */
+ extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */
+ addiu s8, s8, -1
+ addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */
+ subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */
+ sh s0, 16(a0) /* column element 1 */
+ sh s1, 48(a0) /* column element 3 */
+ sh s2, 80(a0) /* column element 5 */
+ sh s3, 112(a0) /* column element 7 */
+ mult v0, t8 /* ac0 = tmp12 * c8 */
+ madd v1, t9 /* ac0 += tmp13 * c9 */
+ mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */
+ msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */
+ addiu a0, a0, 2 /* advance to next column (stores below
+ are relative to the new a0) */
+ extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */
+ extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */
+ shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */
+ shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */
+ sh s4, -2(a0) /* column element 0 */
+ sh s5, 62(a0) /* column element 4 */
+ sh s6, 30(a0) /* column element 2 */
+ bgtz s8, 2b
+ sh s7, 94(a0) /* (delay slot) column element 6 */
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ jr ra
+ nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ *
+ * In-place "ifast" forward DCT on the 128 bytes at a0, accessed as
+ * halfwords.  Loop 0 walks the buffer in 16-byte steps (eight halfwords per
+ * step) until a0 + 128.  Loop 1 then walks eight columns (2-byte steps,
+ * rows 16 bytes apart).  a1, a2, a3 and s1 hold the multipliers 334, 139,
+ * 98 and 181, each replicated in both halfwords.  Every product is scaled
+ * down with a plain (non-rounding) right shift by 8.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) |
+ (334 & 0xffff) */
+ li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) |
+ (139 & 0xffff) */
+ li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) |
+ (98 & 0xffff) */
+ li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) |
+ (181 & 0xffff) */
+
+ move v0, a0
+ addiu v1, v0, 128 /* end address */
+
+0:
+ lw t0, 0(v0) /* tmp0 = 1|0 */
+ lw t1, 4(v0) /* tmp1 = 3|2 */
+ lw t2, 8(v0) /* tmp2 = 5|4 */
+ lw t3, 12(v0) /* tmp3 = 7|6 */
+ packrl.ph t1, t1, t1 /* tmp1 = 2|3 */
+ packrl.ph t3, t3, t3 /* tmp3 = 6|7 */
+ subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sra t4, t8, 16 /* tmp4 = t11 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t9, s1 /* ac0 += t12*181 + t13*181 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */
+ dpsx.w.ph $ac1, t5, a3 /* ac1 -= t6*98 + t7*98 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */
+ precrq.ph.w t0, t5, t7 /* t0 = t6|t5 */
+ addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */
+ subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */
+ extr.w t4, $ac0, 8 /* t4 = z1 = MULTIPLY(t12 + t13, 181) */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */
+ extr.w t0, $ac1, 8 /* t0 = z5 */
+ extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */
+ extr.w t7, $ac3, 8 /* t7 = MULTIPLY(tmp12, 334) */
+ extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */
+ add t6, t1, t0 /* t6 = z2 */
+ add t7, t7, t0 /* t7 = z4 */
+ subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */
+ addq.ph t8, t5, t8 /* t8 = z11 = tmp7 + z3 */
+ addq.ph t1, t0, t6 /* t1 = z13 + z2 */
+ subq.ph t6, t0, t6 /* t6 = z13 - z2 */
+ addq.ph t0, t8, t7 /* t0 = z11 + z4 */
+ subq.ph t7, t8, t7 /* t7 = z11 - z4 */
+ addq.ph t5, t4, t9 /* t5 = t13 + z1 (low halfword) */
+ subq.ph t4, t9, t4 /* t4 = t13 - z1 (low halfword) */
+ sh t2, 0(v0) /* element 0 */
+ sh t5, 4(v0) /* element 2 */
+ sh t3, 8(v0) /* element 4 */
+ sh t4, 12(v0) /* element 6 */
+ sh t1, 10(v0) /* element 5 */
+ sh t6, 6(v0) /* element 3 */
+ sh t0, 2(v0) /* element 1 */
+ sh t7, 14(v0) /* element 7 */
+ addiu v0, 16
+ bne v1, v0, 0b
+ nop
+ move v0, a0
+ addiu v1, v0, 16 /* end address for the column loop */
+
+1:
+ lh t0, 0(v0) /* 0 */
+ lh t1, 16(v0) /* 8 */
+ lh t2, 32(v0) /* 16 */
+ lh t3, 48(v0) /* 24 */
+ lh t4, 64(v0) /* 32 */
+ lh t5, 80(v0) /* 40 */
+ lh t6, 96(v0) /* 48 */
+ lh t7, 112(v0) /* 56 */
+ add t8, t0, t7 /* t8 = tmp0 */
+ sub t7, t0, t7 /* t7 = tmp7 */
+ add t0, t1, t6 /* t0 = tmp1 */
+ sub t1, t1, t6 /* t1 = tmp6 */
+ add t6, t2, t5 /* t6 = tmp2 */
+ sub t5, t2, t5 /* t5 = tmp5 */
+ add t2, t3, t4 /* t2 = tmp3 */
+ sub t3, t3, t4 /* t3 = tmp4 */
+ add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */
+ sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */
+ sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */
+ ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */
+ add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */
+ add s0, t4, t2 /* s0 = tmp10+tmp11 */
+ sub t4, t4, t2 /* t4 = tmp10-tmp11 */
+ sh s0, 0(v0)
+ sh t4, 64(v0)
+ extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13,
+ FIX_0_707106781) */
+ addq.ph t4, t8, t2 /* t4 = tmp13 + z1 (low halfword) */
+ subq.ph t8, t8, t2 /* t8 = tmp13 - z1 (low halfword) */
+ sh t4, 32(v0)
+ sh t8, 96(v0)
+ add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */
+ add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */
+ add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */
+ andi t4, a1, 0xffff /* t4 = 334 (low halfword of a1) */
+ mul s0, t1, t4
+ sra s0, s0, 8 /* s0 = z4 =
+ MULTIPLY(tmp12, FIX_1_306562965) */
+ ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */
+ mult $0, $0 /* ac0 = 0 */
+ mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */
+ extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12,
+ FIX_0_382683433) */
+ add t2, t7, t8 /* t2 = tmp7 + z5 */
+ sub t7, t7, t8 /* t7 = tmp7 - z5 */
+ andi t4, a2, 0xffff /* t4 = 139 (low halfword of a2) */
+ mul t8, t3, t4
+ sra t8, t8, 8 /* t8 = z2 =
+ MULTIPLY(tmp10, FIX_0_541196100) */
+ andi t4, s1, 0xffff /* t4 = 181 (low halfword of s1) */
+ mul t6, t0, t4
+ sra t6, t6, 8 /* t6 = z3 =
+ MULTIPLY(tmp11, FIX_0_707106781) */
+ add t0, t6, t8 /* t0 = z3 + z2 */
+ sub t1, t6, t8 /* t1 = z3 - z2 */
+ add t3, t6, s0 /* t3 = z3 + z4 */
+ sub t4, t6, s0 /* t4 = z3 - z4 */
+ sub t5, t2, t1 /* t5 = dataptr[5] */
+ sub t6, t7, t0 /* t6 = dataptr[3] */
+ add t3, t2, t3 /* t3 = dataptr[1] */
+ add t4, t7, t4 /* t4 = dataptr[7] */
+ sh t5, 80(v0)
+ sh t6, 48(v0)
+ sh t3, 16(v0)
+ sh t4, 112(v0)
+ addiu v0, 2
+ bne v0, v1, 1b
+ nop
+
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ *
+ * Quantizes 64 halfwords from workspace into coef_block, two per step
+ * (31 loop iterations plus one unrolled final step).  For each input x at
+ * halfword index i the code computes:
+ *
+ *   sign = ((x >> 15) << 1) + 1            (-1 if x < 0, else +1)
+ *   mag  = x * sign
+ *   q    = (((mag + d[i + 64]) & 0xffff) * (d[i] & 0xffff))
+ *            >> (d[i + 192] + 16)          (srav, arithmetic shift)
+ *   out  = q * sign
+ *
+ * where d[] is the divisors table viewed as halfwords (byte offsets 0, 128
+ * and 384 from the current a1).
+ *
+ * Values for the next step are loaded while the current step finishes, so
+ * t0-t8 are live across the loop back-edge:
+ *   t0/t3 = mag/sign of element 0, t1 = d[i], t2 = d[i+64], t4 = d[i+192]
+ *   t6    = raw element 1,         t7 = d[i+1], t5 = d[i+65], t8 = d[i+193]
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+ addiu v0, a2, 124 /* v0 = workspace_end */
+ lh t0, 0(a2)
+ lh t1, 0(a1)
+ lh t2, 128(a1)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1 /* t3 = sign of element 0 */
+ mul t0, t0, t3 /* t0 = magnitude of element 0 */
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ lh t8, 386(a1)
+
+1:
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1 /* s0 = sign of element 1 */
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3 /* reapply sign of element 0 */
+ mul t6, t6, s0 /* t6 = magnitude of element 1 */
+ andi t7, 0xffff
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ add s1, t6, t5
+ andi s1, 0xffff
+ sh v1, 0(a0)
+
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ srav s2, s2, s1
+ mul s2, s2, s0 /* reapply sign of element 1 */
+ /* Preload the operands of the next pair.  Element 0 of the next pair
+ (t0/t3) is loaded and sign-split once, just before the branch; an
+ earlier duplicate of that sequence was fully overwritten before
+ use and has been dropped. */
+ lh t1, 0(a1)
+ lh t2, 128(a1)
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t8, 386(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ sh s2, 2(a0)
+ lh t0, 0(a2)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ bne a2, v0, 1b
+ addiu a0, a0, 4 /* (delay slot) next output pair */
+
+ /* Final pair: same computation as the loop body, without preloading. */
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ sh v1, 0(a0)
+ add s1, t6, t5
+ andi s1, 0xffff
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ srav s2, s2, s1
+ mul s2, s2, s0
+ sh s2, 2(a0)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+ j ra
+ nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ *
+ * Processes 64 single-precision values, eight per loop iteration (t0 counts
+ * 63, 55, ..., 7 and the loop exits once it goes negative).  For each index
+ * the code computes
+ *
+ *   trunc(workspace[i] * divisors[i] + 16384.5) - 16384
+ *
+ * and stores the low halfword of the result to coef_block[i].  f0 holds the
+ * constant 16384.5 for the whole function.
+ */
+ .set at
+
+ li t1, 0x46800100 /* integer representation 16384.5 */
+ mtc1 t1, f0
+ li t0, 63
+0:
+ lwc1 f2, 0(a2)
+ lwc1 f10, 0(a1)
+ lwc1 f4, 4(a2)
+ lwc1 f12, 4(a1)
+ lwc1 f6, 8(a2)
+ lwc1 f14, 8(a1)
+ lwc1 f8, 12(a2)
+ lwc1 f16, 12(a1)
+ madd.s f2, f0, f2, f10 /* f2 = f2 * f10 + 16384.5 */
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ lwc1 f10, 16(a1) /* preload divisors 4..7 */
+ lwc1 f12, 20(a1)
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ lwc1 f14, 24(a1)
+ lwc1 f16, 28(a1)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ lwc1 f2, 16(a2) /* workspace values 4..7 */
+ lwc1 f4, 20(a2)
+ lwc1 f6, 24(a2)
+ lwc1 f8, 28(a2)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ addiu t1, t1, -16384 /* remove the 16384 bias (values 0..3) */
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ sh t1, 0(a0)
+ sh t2, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ addiu t0, t0, -8
+ addiu a2, a2, 32
+ addiu a1, a1, 32
+ addiu t1, t1, -16384 /* remove the 16384 bias (values 4..7) */
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ sh t1, 8(a0)
+ sh t2, 10(a0)
+ sh t3, 12(a0)
+ sh t4, 14(a0)
+ bgez t0, 0b
+ addiu a0, a0, 16 /* (delay slot) next eight outputs */
+
+ j ra
+ nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ *
+ * Reduced-size inverse DCT producing a 2x2 block of output bytes.
+ *
+ * Pass 1 handles the columns at halfword offsets 0, 1, 3, 5 and 7 (byte
+ * offsets 0, 2, 6, 10, 14).  For each column it dequantizes rows 0, 1, 3,
+ * 5 and 7 (input * table entry), forms
+ *     tmp10 = row0 << 15
+ *     tmp0  = row1*29692 + row3*(-10426) + row5*6967 + row7*(-5906)
+ * and stores (tmp10 + tmp0) and (tmp10 - tmp0), each with a rounding right
+ * shift by 13, into a 40-byte scratch area on the stack (v0): the sums at
+ * byte offsets 0..16, the differences at 20..36.
+ *
+ * Pass 2 applies the same weights across each of the two scratch rows,
+ * shifts right by 20 with rounding, saturates to a signed byte, adds 128,
+ * and stores two bytes to each of output_buf[0] and output_buf[1] (offset
+ * by output_col).
+ *
+ * s0/s1 hold the weights packed as halfword pairs for dpa.w.ph; s2-s5 hold
+ * the same weights as scalars for pass 2.
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ addiu sp, sp, -40 /* 10-word scratch area */
+ move v0, sp
+ addiu s2, zero, 29692
+ addiu s3, zero, -10426
+ addiu s4, zero, 6967
+ addiu s5, zero, -5906
+ /* Column 0 */
+ lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */
+ lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */
+ lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */
+ lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */
+ mul t4, t5, t0
+ lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */
+ lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */
+ mul t6, t6, t1
+ mul t5, t5, t0
+ lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */
+ lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */
+ lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */
+ lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */
+ mul t7, t7, t2
+ mul t8, t8, t3
+ /* Clear ac0 only after the last mul: the MIPS32 ISA leaves HI/LO
+ UNPREDICTABLE after mul, so clearing before it (as this column
+ originally did) is not guaranteed to survive until dpa.w.ph. */
+ mult zero, zero /* ac0 = 0 */
+ li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+ li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+ ins t6, t5, 16, 16 /* t6 = t5|t6 */
+ sll t4, t4, 15 /* tmp10 = row0 << 15 */
+ dpa.w.ph $ac0, t6, s0 /* ac0 += row1*29692 + row3*-10426 */
+ lh t1, 2(a1) /* start column 1: row 0 */
+ lh t6, 2(a0)
+ ins t8, t7, 16, 16 /* t8 = t7|t8 */
+ dpa.w.ph $ac0, t8, s1 /* ac0 += row5*6967 + row7*-5906 */
+ mflo t0, $ac0 /* tmp0 of column 0 */
+ /* Column 1 (column 0 results are finished and stored meanwhile) */
+ mul t5, t6, t1
+ lh t1, 18(a1)
+ lh t6, 18(a0)
+ lh t2, 50(a1)
+ lh t7, 50(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0 /* tmp10 - tmp0 */
+ mul t7, t7, t2
+ addu t0, t4, t0 /* tmp10 + tmp0 */
+ shra_r.w t0, t0, 13
+ lh t1, 82(a1)
+ lh t2, 82(a0)
+ lh t3, 114(a1)
+ lh t4, 114(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 0(v0)
+ sw t8, 20(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero /* ac0 = 0 */
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 6(a1) /* start column 3: row 0 */
+ lh t6, 6(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 /* tmp0 of column 1 */
+ /* Column 3 */
+ mul t5, t6, t1
+ lh t1, 22(a1)
+ lh t6, 22(a0)
+ lh t2, 54(a1)
+ lh t7, 54(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 86(a1)
+ lh t2, 86(a0)
+ lh t3, 118(a1)
+ lh t4, 118(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 4(v0)
+ sw t8, 24(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero /* ac0 = 0 */
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 10(a1) /* start column 5: row 0 */
+ lh t6, 10(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 /* tmp0 of column 3 */
+ /* Column 5 */
+ mul t5, t6, t1
+ lh t1, 26(a1)
+ lh t6, 26(a0)
+ lh t2, 58(a1)
+ lh t7, 58(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 90(a1)
+ lh t2, 90(a0)
+ lh t3, 122(a1)
+ lh t4, 122(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 8(v0)
+ sw t8, 28(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero /* ac0 = 0 */
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 14(a1) /* start column 7: row 0 */
+ lh t6, 14(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 /* tmp0 of column 5 */
+ /* Column 7 */
+ mul t5, t6, t1
+ lh t1, 30(a1)
+ lh t6, 30(a0)
+ lh t2, 62(a1)
+ lh t7, 62(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 94(a1)
+ lh t2, 94(a0)
+ lh t3, 126(a1)
+ lh t4, 126(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 12(v0)
+ sw t8, 32(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero /* ac0 = 0 */
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0 /* tmp0 of column 7 */
+ /* Pass 2, output row 0 (interleaved with the column 7 stores) */
+ lw t9, 0(a2) /* output_buf[0] */
+ lw t3, 0(v0)
+ lw t7, 4(v0)
+ lw t1, 8(v0)
+ addu t9, t9, a3 /* outptr = output_buf[0] + output_col */
+ sll t3, t3, 15 /* tmp10 = wsptr[0] << 15 */
+ subu t8, t4, t0
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ shra_r.w t8, t8, 13
+ sw t0, 16(v0)
+ sw t8, 36(v0)
+ lw t5, 12(v0)
+ lw t6, 16(v0)
+ mult t7, s2 /* ac0 = ws[1] * 29692 */
+ madd t1, s3 /* ac0 += ws[2] * -10426 */
+ madd t5, s4 /* ac0 += ws[3] * 6967 */
+ madd t6, s5 /* ac0 += ws[4] * -5906 */
+ lw t5, 24(v0)
+ lw t7, 28(v0)
+ mflo t0, $ac0
+ lw t8, 32(v0)
+ lw t2, 36(v0)
+ mult $ac1, t5, s2 /* ac1: same weights on scratch row 1 */
+ madd $ac1, t7, s3
+ madd $ac1, t8, s4
+ madd $ac1, t2, s5
+ addu t1, t3, t0
+ subu t6, t3, t0
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ mflo t4, $ac1
+ shll_s.w t1, t1, 24 /* saturate to signed 8 bits ... */
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24 /* ... and sign-extend back */
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ lw t0, 20(v0)
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ /* Pass 2, output row 1 */
+ sll t0, t0, 15 /* tmp10 = wsptr[5] << 15 */
+ lw t9, 4(a2) /* output_buf[1] */
+ addu t1, t0, t4
+ subu t6, t0, t4
+ addu t9, t9, a3 /* outptr = output_buf[1] + output_col */
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ addiu sp, sp, 40 /* release scratch area */
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
+ *
+ * Reduced-size inverse DCT producing a 4x4 block of output bytes.
+ *
+ * Pass 1 processes input columns 0-3 (loop 0) and 5-7 (loop 1); column 4
+ * is never read.  For each column it dequantizes rows 0, 1, 2, 3, 5, 6
+ * and 7, then writes four words to the workspace, 32 bytes apart.
+ *
+ * Pass 2 is unrolled four times, once per workspace row (32-byte stride).
+ * Each copy reads workspace words 0, 2, 6 and the low halfwords of words
+ * 1, 3, 5, 7, and writes four bytes to output_buf[row] + output_col after
+ * a rounding shift by 19, saturation to a signed byte, and adding 128.
+ *
+ * Packed odd-part weights (high | low halfword), used with dpa.w.ph:
+ *   s0 = 11893 | -1730     applied to (row5 | row7)
+ *   s1 =  8697 | -17799    applied to (row1 | row3)
+ *   s2 = -4926 | -4176     applied to (row5 | row7)
+ *   s3 = 20995 |  7373     applied to (row1 | row3)
+ * The even part uses 15137 (FIX_1_847759065) and 6270 (FIX_0_765366865).
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw v1, 48(sp) /* v1 = workspace (5th argument) */
+ move t0, a1 /* t0 = inptr */
+ move t1, v1 /* t1 = wsptr */
+ li t9, 4 /* loop 0: four columns */
+ li s0, 0x2e75f93e
+ li s1, 0x21f9ba79
+ li s2, 0xecc2efb0
+ li s3, 0x52031ccd
+
+0:
+ lh s6, 32(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 32(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 96(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 96(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 0(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 0(a0) /* quantptr[0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul t6, s6, t6 /* MULTIPLY(z2, FIX_1_847759065) */
+ lh t5, 112(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* MULTIPLY(z3, FIX_0_765366865) */
+ lh s4, 112(a0) /* quantptr[DCTSIZE*7] */
+ lh v0, 80(t0) /* inptr[DCTSIZE*5] */
+ lh s5, 80(a0) /* quantptr[DCTSIZE*5] */
+ lh s6, 48(a0) /* quantptr[DCTSIZE*3] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s7, 16(a0) /* quantptr[DCTSIZE*1] */
+ lh t8, 16(t0) /* inptr[DCTSIZE*1] */
+ subu t6, t6, t7 /* tmp2 = MULTIPLY(z2, 15137) -
+ MULTIPLY(z3, 6270) */
+ lh t7, 48(t0) /* inptr[DCTSIZE*3] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, t6 /* tmp10 = tmp0 + tmp2 */
+ subu t4, t2, t6 /* tmp12 = tmp0 - tmp2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, v0, 16, 16 /* t5 = z2|z1 */
+ ins t7, t8, 16, 16 /* t7 = z4|z3 */
+ addiu t9, t9, -1
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1 /* ac0 = temp1 */
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3 /* ac1 = temp2 */
+ mflo s4, $ac0
+ mflo s5, $ac1
+ addiu a0, a0, 2 /* next column (stores below are
+ relative to the advanced t1) */
+ addiu t1, t1, 4
+ addiu t0, t0, 2
+ addu t6, t4, s4
+ subu t5, t4, s4
+ addu s6, t3, s5
+ subu s7, t3, s5
+ shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw t6, 28(t1) /* wsptr[DCTSIZE*1] */
+ sw t5, 60(t1) /* wsptr[DCTSIZE*2] */
+ sw s6, -4(t1) /* wsptr[DCTSIZE*0] */
+ bgtz t9, 0b
+ sw s7, 92(t1) /* (delay slot) wsptr[DCTSIZE*3] */
+ /* Loop 1: three more columns (5, 6, 7).  The pointers still address
+ column 4, which is skipped, so every offset below is one element
+ further than in loop 0. */
+ li t9, 3
+1:
+ lh s6, 34(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 34(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 98(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 98(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 2(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 2(a0) /* quantptr[DCTSIZE*0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul v0, s6, t6 /* MULTIPLY(z2, FIX_1_847759065) */
+ lh t5, 114(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* MULTIPLY(z3, FIX_0_765366865) */
+ lh s4, 114(a0) /* quantptr[DCTSIZE*7] */
+ lh s5, 82(a0) /* quantptr[DCTSIZE*5] */
+ lh t6, 82(t0) /* inptr[DCTSIZE*5] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s6, 50(a0) /* quantptr[DCTSIZE*3] */
+ lh t8, 18(t0) /* inptr[DCTSIZE*1] */
+ subu v0, v0, t7 /* tmp2 = MULTIPLY(z2, 15137) -
+ MULTIPLY(z3, 6270) */
+ lh t7, 50(t0) /* inptr[DCTSIZE*3] */
+ lh s7, 18(a0) /* quantptr[DCTSIZE*1] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, v0 /* tmp10 = tmp0 + tmp2 */
+ subu t4, t2, v0 /* tmp12 = tmp0 - tmp2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, t6, 16, 16 /* t5 = z2|z1 */
+ ins t7, t8, 16, 16 /* t7 = z4|z3 */
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1 /* ac0 = temp1 */
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3 /* ac1 = temp2 */
+ mflo t5, $ac0
+ mflo t6, $ac1
+ addiu t9, t9, -1
+ addiu t0, t0, 2
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addu s5, t4, t5
+ subu s4, t4, t5
+ addu s6, t3, t6
+ subu s7, t3, t6
+ shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw s5, 32(t1) /* wsptr[DCTSIZE*1] */
+ sw s4, 64(t1) /* wsptr[DCTSIZE*2] */
+ sw s6, 0(t1) /* wsptr[DCTSIZE*0] */
+ bgtz t9, 1b
+ sw s7, 96(t1) /* (delay slot) wsptr[DCTSIZE*3] */
+ /* Pass 2, output row 0 (workspace byte offset 0) */
+ move t1, v1
+ li s4, 15137
+ lw s6, 8(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 24(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 0(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ FIX_0_765366865), subtracted below */
+ lh t5, 28(t1) /* wsptr[7] */
+ lh t6, 20(t1) /* wsptr[5] */
+ lh t7, 12(t1) /* wsptr[3] */
+ lh t8, 4(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0 /* temp1 */
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5 /* tmp2 */
+ addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
+ mflo s7, $ac1 /* temp2 */
+ subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ lw v0, 0(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24 /* saturate to signed 8 bits ... */
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24 /* ... and sign-extend back */
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* Pass 2, output row 1 (workspace byte offset 32) */
+ li s4, 15137
+ lw s6, 40(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 56(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 32(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ FIX_0_765366865), subtracted below */
+ lh t5, 60(t1) /* wsptr[7] */
+ lh t6, 52(t1) /* wsptr[5] */
+ lh t7, 44(t1) /* wsptr[3] */
+ lh t8, 36(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0 /* temp1 */
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5 /* tmp2 */
+ addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
+ mflo s7, $ac1 /* temp2 */
+ subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ lw v0, 4(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* Pass 2, output row 2 (workspace byte offset 64) */
+ li s4, 15137
+ lw s6, 72(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 88(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 64(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ FIX_0_765366865), subtracted below */
+ lh t5, 92(t1) /* wsptr[7] */
+ lh t6, 84(t1) /* wsptr[5] */
+ lh t7, 76(t1) /* wsptr[3] */
+ lh t8, 68(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0 /* temp1 */
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5 /* tmp2 */
+ addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
+ mflo s7, $ac1 /* temp2 */
+ subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ lw v0, 8(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* Pass 2, output row 3 (workspace byte offset 96) */
+ li s4, 15137
+ lw s6, 104(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 120(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 96(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ FIX_0_765366865), subtracted below */
+ lh t5, 124(t1) /* wsptr[7] */
+ lh t6, 116(t1) /* wsptr[5] */
+ lh t7, 108(t1) /* wsptr[3] */
+ lh t8, 100(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0 /* temp1 */
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5 /* tmp2 */
+ addu t3, t2, s4 /* tmp10 = tmp0 + tmp2 */
+ mflo s7, $ac1 /* temp2 */
+ subu t4, t2, s4 /* tmp12 = tmp0 - tmp2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ lw v0, 12(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/* 6x6 inverse DCT: pass 1 dequantizes and transforms 6 columns into a stack work array, pass 2 turns its 6 rows into output bytes.
+ * a0 = compptr->dct_table (16-bit multipliers, read with lh)
+ * a1 = coef_block (16-bit coefficients, read with lh)
+ * a2 = output_buf (array of row pointers, one loaded per output row)
+ * a3 = output_col (byte offset added to each row pointer)
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -144 /* reserve the work array: 6 rows x 6 words */
+ move v0, sp /* v0 = current work array column */
+ addiu v1, v0, 24 /* v1 = pass-1 end marker (6 columns x 4 bytes) */
+ addiu t9, zero, 5793 /* ~0.7071 * 2^13 */
+ addiu s0, zero, 10033 /* ~1.2247 * 2^13 */
+ addiu s1, zero, 2998 /* ~0.3660 * 2^13 */
+
+1: /* Pass 1: process 6 columns from input, store into work array. */
+ lh s2, 0(a0) /* q0 = quantptr[ 0] */
+ lh s3, 32(a0) /* q1 = quantptr[16] */
+ lh s4, 64(a0) /* q2 = quantptr[32] */
+ lh t2, 64(a1) /* tmp2 = inptr[32] */
+ lh t1, 32(a1) /* tmp1 = inptr[16] */
+ lh t0, 0(a1) /* tmp0 = inptr[ 0] */
+ mul t2, t2, s4 /* tmp2 = tmp2 * q2 */
+ mul t1, t1, s3 /* tmp1 = tmp1 * q1 */
+ mul t0, t0, s2 /* tmp0 = tmp0 * q0 */
+ lh t6, 16(a1) /* z1 = inptr[ 8] */
+ lh t8, 80(a1) /* z3 = inptr[40] */
+ lh t7, 48(a1) /* z2 = inptr[24] */
+ lh s2, 16(a0) /* q0 = quantptr[ 8] */
+ lh s4, 80(a0) /* q2 = quantptr[40] */
+ lh s3, 48(a0) /* q1 = quantptr[24] */
+ mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */
+ mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ mul t6, t6, s2 /* z1 = z1 * q0 */
+ mul t8, t8, s4 /* z3 = z3 * q2 */
+ mul t7, t7, s3 /* z2 = z2 * q1 */
+ addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */
+ sll t2, t2, 1 /* tmp2 = tmp2 << 1 */
+ subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */
+ subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */
+ addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */
+ addu t1, t6, t8 /* tmp1 = z1 + z3 */
+ mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ subu t2, t6, t8 /* tmp2 = z1 - z3 */
+ subu t2, t2, t7 /* tmp2 = tmp2 - z2 */
+ sll t2, t2, 2 /* tmp2 = tmp2 << 2 */
+ addu t0, t6, t7 /* tmp0 = z1 + z2 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ subu s2, t8, t7 /* q0 = z3 - z2 */
+ sll s2, s2, 13 /* q0 = q0 << 13 */
+ addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */
+ addu t1, s2, t1 /* tmp1 = q0 + tmp1 */
+ addu s2, t4, t2 /* q0 = tmp11 + tmp2 */
+ subu s3, t4, t2 /* q1 = tmp11 - tmp2 */
+ addu t6, t3, t0 /* z1 = tmp10 + tmp0 */
+ subu t7, t3, t0 /* z2 = tmp10 - tmp0 */
+ addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */
+ subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */
+ shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */
+ shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */
+ sw s2, 24(v0) /* work row 1 (rows are 24 bytes apart) */
+ sw s3, 96(v0) /* work row 4 */
+ sw t6, 0(v0) /* work row 0 */
+ sw t7, 120(v0) /* work row 5 */
+ sw t4, 48(v0) /* work row 2 */
+ sw t5, 72(v0) /* work row 3 */
+ addiu v0, v0, 4 /* next work array column */
+ addiu a1, a1, 2 /* next coefficient column */
+ bne v0, v1, 1b
+ addiu a0, a0, 2 /* (delay slot) next dct_table column */
+
+ /* Pass 2: process 6 rows from work array, store into output array. */
+ move v0, sp /* v0 = current work array row */
+ addiu v1, v0, 144 /* v1 = end of the work array */
+
+2:
+ lw t0, 0(v0) /* tmp0 = row[0] */
+ lw t2, 16(v0) /* tmp2 = row[4] */
+ lw s5, 0(a2) /* s5 = output_buf[ctr] */
+ addiu t0, t0, 16 /* + 16: becomes 2^17 after << 13, the rounding term for the net >> 18 below */
+ sll t0, t0, 13 /* tmp0 <<= 13 */
+ mul t3, t2, t9 /* t3 = row[4] * 5793 */
+ lw t6, 4(v0) /* z1 = row[1] */
+ lw t8, 20(v0) /* z3 = row[5] */
+ lw t7, 12(v0) /* z2 = row[3] */
+ addu s5, s5, a3 /* outptr = output_buf[ctr] + output_col */
+ addu s6, t6, t8 /* z1 + z3 */
+ mul s6, s6, s1 /* s6 = (z1 + z3) * 2998 */
+ addu t1, t0, t3 /* t1 = tmp0 + t3 */
+ subu t4, t0, t3
+ subu t4, t4, t3 /* t4 = tmp0 - 2 * t3 */
+ lw t3, 8(v0) /* t3 = row[2] */
+ mul t0, t3, s0 /* t0 = row[2] * 10033 */
+ addu s7, t6, t7 /* z1 + z2 */
+ sll s7, s7, 13
+ addu s7, s6, s7 /* s7 = s6 + ((z1 + z2) << 13) */
+ subu t2, t8, t7 /* z3 - z2 */
+ sll t2, t2, 13
+ addu t2, s6, t2 /* t2 = s6 + ((z3 - z2) << 13) */
+ subu s6, t6, t7
+ subu s6, s6, t8 /* z1 - z2 - z3 */
+ sll s6, s6, 13 /* s6 = (z1 - z2 - z3) << 13 */
+ addu t3, t1, t0 /* even sum */
+ subu t5, t1, t0 /* even difference */
+ addu t6, t3, s7 /* value for outptr[0] */
+ subu t3, t3, s7 /* value for outptr[5] */
+ addu t7, t4, s6 /* value for outptr[1] */
+ subu t4, t4, s6 /* value for outptr[4] */
+ addu t8, t5, t2 /* value for outptr[2] */
+ subu t5, t5, t2 /* value for outptr[3] */
+ shll_s.w t6, t6, 6 /* saturating << 6; with the sra 24 below this is a clamped >> 18 */
+ shll_s.w t3, t3, 6
+ shll_s.w t7, t7, 6
+ shll_s.w t4, t4, 6
+ shll_s.w t8, t8, 6
+ shll_s.w t5, t5, 6
+ sra t6, t6, 24 /* keep the top byte as a signed value */
+ addiu t6, t6, 128 /* + 128 before the byte store */
+ sra t3, t3, 24
+ addiu t3, t3, 128
+ sb t6, 0(s5)
+ sra t7, t7, 24
+ addiu t7, t7, 128
+ sb t3, 5(s5)
+ sra t4, t4, 24
+ addiu t4, t4, 128
+ sb t7, 1(s5)
+ sra t8, t8, 24
+ addiu t8, t8, 128
+ sb t4, 4(s5)
+ addiu v0, v0, 24 /* next work array row */
+ sra t5, t5, 24
+ addiu t5, t5, 128
+ sb t8, 2(s5)
+ addiu a2, a2, 4 /* next output row pointer */
+ bne v0, v1, 2b
+ sb t5, 3(s5) /* (delay slot) */
+
+ addiu sp, sp, 144 /* release the work array */
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/* Pass 1 of the 12x12 inverse DCT: dequantizes 8 input columns and writes 12 words per column into the workspace.
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace (12 rows of 8 words; rows are 32 bytes apart)
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 8 /* loop counter: 8 columns */
+
+1:
+ /* odd part */
+ lh t0, 48(a1) /* inptr[DCTSIZE*3] */
+ lh t1, 48(a0) /* quantptr[DCTSIZE*3] */
+ lh t2, 16(a1) /* inptr[DCTSIZE*1] */
+ lh t3, 16(a0) /* quantptr[DCTSIZE*1] */
+ lh t4, 80(a1) /* inptr[DCTSIZE*5] */
+ lh t5, 80(a0) /* quantptr[DCTSIZE*5] */
+ lh t6, 112(a1) /* inptr[DCTSIZE*7] */
+ lh t7, 112(a0) /* quantptr[DCTSIZE*7] */
+ mul t0, t0, t1 /* z2 */
+ mul t1, t2, t3 /* z1 */
+ mul t2, t4, t5 /* z3 */
+ mul t3, t6, t7 /* z4 */
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ li t6, 7053 /* FIX(0.860918669) */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t7, t1, t2 /* tmp10 */
+ addu t8, t7, t3 /* tmp10 + z4 */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2139 /* FIX(0.261052384) */
+ mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ li t7, 2295 /* FIX(0.280143716) */
+ mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1-=z4 */
+ subu t0, t0, t2 /* z2-=z3 */
+ addu t2, t0, t1 /* z1+z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t8, t6, t8 /* tmp12 */
+ addu t3, t8, t4 /* tmp12 + tmp11 */
+ addu t3, t3, t7 /* tmp10 */
+ subu t8, t8, t9 /* tmp12 + tmp13 */
+ addu s0, t5, s0
+ subu t8, t8, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu s1, s1, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ /* even part start */
+ lh t4, 64(a1)
+ lh t5, 64(a0)
+ lh t7, 32(a1)
+ lh s0, 32(a0)
+ lh s1, 0(a1)
+ lh s2, 0(a0)
+ lh s3, 96(a1)
+ lh v0, 96(a0)
+ mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ /* odd part end */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* update counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 2
+ addiu a1, a1, 2
+ /* even part rest */
+ li s1, 10033 /* FIX(1.224744871) */
+ li s2, 11190 /* FIX(1.366025404) */
+ mul t4, t4, s1 /* z4 */
+ mul s1, t5, s2 /* z4 */
+ sll t5, t5, 13 /* z1 */
+ sll t7, t7, 13
+ addiu t7, t7, 1024 /* z3 */
+ sll s0, s0, 13 /* z2 */
+ addu s2, t7, t4 /* tmp10 */
+ subu t4, t7, t4 /* tmp11 */
+ subu s3, t5, s0 /* tmp12 */
+ addu t2, t7, s3 /* tmp21 */
+ subu s3, t7, s3 /* tmp24 */
+ addu t7, s1, s0 /* tmp12 */
+ addu v0, s2, t7 /* tmp20 */
+ subu s2, s2, t7 /* tmp25 */
+ subu s1, s1, t5 /* z4 - z1 */
+ subu s1, s1, s0 /* tmp12 */
+ addu s0, t4, s1 /* tmp22 */
+ subu t4, t4, s1 /* tmp23 */
+ /* final output stage */
+ addu t5, v0, t3 /* tmp20 + tmp10 */
+ subu v0, v0, t3 /* tmp20 - tmp10 */
+ addu t3, t2, t1 /* tmp21 + tmp11 */
+ subu t2, t2, t1 /* tmp21 - tmp11 */
+ addu t1, s0, t8 /* tmp22 + tmp12 */
+ subu s0, s0, t8 /* tmp22 - tmp12 */
+ addu t8, t4, t9 /* tmp23 + tmp13 */
+ subu t4, t4, t9 /* tmp23 - tmp13 */
+ addu t9, s3, t0 /* tmp24 + tmp14 */
+ subu s3, s3, t0 /* tmp24 - tmp14 */
+ addu t0, s2, t6 /* tmp25 + tmp15 */
+ subu s2, s2, t6 /* tmp25 - tmp15 */
+ sra t5, t5, 11 /* descale by 11 bits; the 1024 added to z3 above is the rounding term */
+ sra t3, t3, 11
+ sra t1, t1, 11
+ sra t8, t8, 11
+ sra t9, t9, 11
+ sra t0, t0, 11
+ sra s2, s2, 11
+ sra s3, s3, 11
+ sra t4, t4, 11
+ sra s0, s0, 11
+ sra t2, t2, 11
+ sra v0, v0, 11
+ sw t5, 0(a2) /* wsptr[8*0] */
+ sw t3, 32(a2) /* wsptr[8*1] */
+ sw t1, 64(a2) /* wsptr[8*2] */
+ sw t8, 96(a2) /* wsptr[8*3] */
+ sw t9, 128(a2) /* wsptr[8*4] */
+ sw t0, 160(a2) /* wsptr[8*5] */
+ sw s2, 192(a2) /* wsptr[8*6] */
+ sw s3, 224(a2) /* wsptr[8*7] */
+ sw t4, 256(a2) /* wsptr[8*8] */
+ sw s0, 288(a2) /* wsptr[8*9] */
+ sw t2, 320(a2) /* wsptr[8*10] */
+ sw v0, 352(a2) /* wsptr[8*11] */
+ bgtz a3, 1b
+ addiu a2, a2, 4 /* (delay slot) next workspace column */
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/* Pass 2 of the 12x12 inverse DCT: turns each of the 12 workspace rows (8 words) into 12 output bytes.
+ * a0 = workspace (rows are 32 bytes apart)
+ * a1 = output (array of row pointers, one loaded per row; no column offset is added here)
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 12 /* loop counter: 12 rows */
+
+1:
+ /* Odd part */
+ lw t0, 12(a0) /* z2 = wsptr[3] */
+ lw t1, 4(a0) /* z1 = wsptr[1] */
+ lw t2, 20(a0) /* z3 = wsptr[5] */
+ lw t3, 28(a0) /* z4 = wsptr[7] */
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t6, t1, t2 /* tmp10 */
+ li t7, 2139 /* FIX(0.261052384) */
+ mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ addu t6, t6, t3 /* tmp10 + z4 */
+ li t8, 7053 /* FIX(0.860918669) */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2295 /* FIX(0.280143716) */
+ mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1 -= z4 */
+ subu t0, t0, t2 /* z2 -= z3 */
+ addu t2, t1, t0 /* z1 + z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t3, t6, t7 /* tmp12 */
+ addu t7, t3, t4
+ addu t7, t7, t8 /* tmp10 */
+ subu t3, t3, t9
+ subu t3, t3, t5
+ subu t3, t3, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu t9, t9, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* even part */
+ lw t2, 16(a0) /* z4 */
+ lw t4, 8(a0) /* z1 */
+ lw t5, 0(a0) /* z3 */
+ lw t8, 24(a0) /* z2 */
+ li s0, 10033 /* FIX(1.224744871) */
+ li s1, 11190 /* FIX(1.366025404) */
+ mul t2, t2, s0 /* z4 */
+ mul s0, t4, s1 /* z4 */
+ addiu t5, t5, 0x10 /* + 16: becomes 2^17 after << 13, the rounding term for the net >> 18 below */
+ sll t5, t5, 13 /* z3 */
+ sll t4, t4, 13 /* z1 */
+ sll t8, t8, 13 /* z2 */
+ subu s1, t4, t8 /* tmp12 */
+ addu s2, t5, t2 /* tmp10 */
+ subu t2, t5, t2 /* tmp11 */
+ addu s3, t5, s1 /* tmp21 */
+ subu s1, t5, s1 /* tmp24 */
+ addu t5, s0, t8 /* tmp12 */
+ addu v0, s2, t5 /* tmp20 */
+ subu t5, s2, t5 /* tmp25 */
+ subu t4, s0, t4
+ subu t4, t4, t8 /* tmp12 */
+ addu t8, t2, t4 /* tmp22 */
+ subu t2, t2, t4 /* tmp23 */
+ /* increment counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 32 /* next workspace row */
+ /* Final stage */
+ addu t4, v0, t7 /* tmp20 + tmp10 -> outptr[0] */
+ subu v0, v0, t7 /* tmp20 - tmp10 -> outptr[11] */
+ addu t7, s3, t1 /* tmp21 + tmp11 -> outptr[1] */
+ subu s3, s3, t1 /* tmp21 - tmp11 -> outptr[10] */
+ addu t1, t8, t3 /* tmp22 + tmp12 -> outptr[2] */
+ subu t8, t8, t3 /* tmp22 - tmp12 -> outptr[9] */
+ addu t3, t2, t9 /* tmp23 + tmp13 -> outptr[3] */
+ subu t2, t2, t9 /* tmp23 - tmp13 -> outptr[8] */
+ addu t9, s1, t0 /* tmp24 + tmp14 -> outptr[4] */
+ subu s1, s1, t0 /* tmp24 - tmp14 -> outptr[7] */
+ addu t0, t5, t6 /* tmp25 + tmp15 -> outptr[5] */
+ subu t5, t5, t6 /* tmp25 - tmp15 -> outptr[6] */
+ sll t4, t4, 4 /* plain (non-saturating) << 4 */
+ sll t7, t7, 4
+ sll t1, t1, 4
+ sll t3, t3, 4
+ sll t9, t9, 4
+ sll t0, t0, 4
+ sll t5, t5, 4
+ sll s1, s1, 4
+ sll t2, t2, 4
+ sll t8, t8, 4
+ sll s3, s3, 4
+ sll v0, v0, 4
+ shll_s.w t4, t4, 2 /* saturating << 2 (total left shift: 6) */
+ shll_s.w t7, t7, 2
+ shll_s.w t1, t1, 2
+ shll_s.w t3, t3, 2
+ shll_s.w t9, t9, 2
+ shll_s.w t0, t0, 2
+ shll_s.w t5, t5, 2
+ shll_s.w s1, s1, 2
+ shll_s.w t2, t2, 2
+ shll_s.w t8, t8, 2
+ shll_s.w s3, s3, 2
+ shll_s.w v0, v0, 2
+ srl t4, t4, 24 /* keep the top byte: net descale of >> 18 */
+ srl t7, t7, 24
+ srl t1, t1, 24
+ srl t3, t3, 24
+ srl t9, t9, 24
+ srl t0, t0, 24
+ srl t5, t5, 24
+ srl s1, s1, 24
+ srl t2, t2, 24
+ srl t8, t8, 24
+ srl s3, s3, 24
+ srl v0, v0, 24
+ lw t6, 0(a1) /* t6 = output row pointer (tmp15 is no longer needed) */
+ addiu t4, t4, 0x80 /* + 128; only the low byte is stored by sb */
+ addiu t7, t7, 0x80
+ addiu t1, t1, 0x80
+ addiu t3, t3, 0x80
+ addiu t9, t9, 0x80
+ addiu t0, t0, 0x80
+ addiu t5, t5, 0x80
+ addiu s1, s1, 0x80
+ addiu t2, t2, 0x80
+ addiu t8, t8, 0x80
+ addiu s3, s3, 0x80
+ addiu v0, v0, 0x80
+ sb t4, 0(t6)
+ sb t7, 1(t6)
+ sb t1, 2(t6)
+ sb t3, 3(t6)
+ sb t9, 4(t6)
+ sb t0, 5(t6)
+ sb t5, 6(t6)
+ sb s1, 7(t6)
+ sb t2, 8(t6)
+ sb t8, 9(t6)
+ sb s3, 10(t6)
+ sb v0, 11(t6)
+ bgtz a3, 1b
+ addiu a1, a1, 4 /* (delay slot) next output row pointer */
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ jr ra
+ nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/* Converts an 8x8 block of byte samples to 16-bit values: each byte is zero-extended to a halfword and 0xff80 (-128) is added.
+ * a0 = sample_data (array of 8 row pointers)
+ * a1 = start_col (byte offset added to each row pointer)
+ * a2 = workspace (receives 8 rows of 16 bytes)
+ * The loads for the next row are interleaved with the stores of the current one.
+ */
+ lw t0, 0(a0) /* row 0 pointer */
+ li t7, 0xff80ff80 /* -128 in each halfword */
+ addu t0, t0, a1 /* + start_col */
+ ulw t1, 0(t0) /* first 4 samples (unaligned load) */
+ ulw t2, 4(t0) /* last 4 samples */
+ preceu.ph.qbr t3, t1 /* zero-extend the two right bytes to halfwords */
+ preceu.ph.qbl t4, t1 /* zero-extend the two left bytes to halfwords */
+ lw t0, 4(a0) /* row 1 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7 /* add 0xff80 to both halfwords */
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 0(a2) /* store row 0 (bytes 0-15 of the workspace) */
+ usw t4, 4(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 8(a2)
+ usw t6, 12(a2)
+
+ lw t0, 8(a0) /* row 2 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 16(a2) /* store row 1 */
+ usw t4, 20(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 24(a2)
+ usw t6, 28(a2)
+
+ lw t0, 12(a0) /* row 3 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 32(a2) /* store row 2 */
+ usw t4, 36(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 40(a2)
+ usw t6, 44(a2)
+
+ lw t0, 16(a0) /* row 4 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 48(a2) /* store row 3 */
+ usw t4, 52(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 56(a2)
+ usw t6, 60(a2)
+
+ lw t0, 20(a0) /* row 5 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 64(a2) /* store row 4 */
+ usw t4, 68(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 72(a2)
+ usw t6, 76(a2)
+
+ lw t0, 24(a0) /* row 6 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 80(a2) /* store row 5 */
+ usw t4, 84(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 88(a2)
+ usw t6, 92(a2)
+
+ lw t0, 28(a0) /* row 7 pointer */
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 96(a2) /* store row 6 */
+ usw t4, 100(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 104(a2)
+ usw t6, 108(a2)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 112(a2) /* store row 7 */
+ usw t4, 116(a2)
+ usw t5, 120(a2)
+ usw t6, 124(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/* Converts an 8x8 block of unsigned byte samples to single-precision floats, subtracting 128 from each sample first.
+ * a0 = sample_data (array of 8 row pointers)
+ * a1 = start_col (byte offset added to each row pointer)
+ * a2 = workspace (receives 64 floats, 32 bytes per row)
+ */
+ .set at
+
+ lw t0, 0(a0) /* row 0 pointer */
+ addu t0, t0, a1 /* + start_col */
+ lbu t1, 0(t0) /* load the 8 samples of the row, zero-extended */
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128 /* subtract 128 from each sample */
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2 /* move the integers into FPU registers */
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2 /* convert integer word to single-precision float */
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 4(a0) /* row 1 pointer, fetched while row 0 is stored */
+ swc1 f2, 0(a2) /* store row 0 */
+ swc1 f4, 4(a2)
+ swc1 f6, 8(a2)
+ addu t0, t0, a1
+ swc1 f8, 12(a2)
+ swc1 f10, 16(a2)
+ swc1 f12, 20(a2)
+ swc1 f14, 24(a2)
+ swc1 f16, 28(a2)
+ /* row 1 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 8(a0) /* row 2 pointer */
+ swc1 f2, 32(a2)
+ swc1 f4, 36(a2)
+ swc1 f6, 40(a2)
+ addu t0, t0, a1
+ swc1 f8, 44(a2)
+ swc1 f10, 48(a2)
+ swc1 f12, 52(a2)
+ swc1 f14, 56(a2)
+ swc1 f16, 60(a2)
+ /* row 2 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 12(a0) /* row 3 pointer */
+ swc1 f2, 64(a2)
+ swc1 f4, 68(a2)
+ swc1 f6, 72(a2)
+ addu t0, t0, a1
+ swc1 f8, 76(a2)
+ swc1 f10, 80(a2)
+ swc1 f12, 84(a2)
+ swc1 f14, 88(a2)
+ swc1 f16, 92(a2)
+ /* row 3 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 16(a0) /* row 4 pointer */
+ swc1 f2, 96(a2)
+ swc1 f4, 100(a2)
+ swc1 f6, 104(a2)
+ addu t0, t0, a1
+ swc1 f8, 108(a2)
+ swc1 f10, 112(a2)
+ swc1 f12, 116(a2)
+ swc1 f14, 120(a2)
+ swc1 f16, 124(a2)
+ /* row 4 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 20(a0) /* row 5 pointer */
+ swc1 f2, 128(a2)
+ swc1 f4, 132(a2)
+ swc1 f6, 136(a2)
+ addu t0, t0, a1
+ swc1 f8, 140(a2)
+ swc1 f10, 144(a2)
+ swc1 f12, 148(a2)
+ swc1 f14, 152(a2)
+ swc1 f16, 156(a2)
+ /* row 5 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 24(a0) /* row 6 pointer */
+ swc1 f2, 160(a2)
+ swc1 f4, 164(a2)
+ swc1 f6, 168(a2)
+ addu t0, t0, a1
+ swc1 f8, 172(a2)
+ swc1 f10, 176(a2)
+ swc1 f12, 180(a2)
+ swc1 f14, 184(a2)
+ swc1 f16, 188(a2)
+ /* row 6 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 28(a0) /* row 7 pointer */
+ swc1 f2, 192(a2)
+ swc1 f4, 196(a2)
+ swc1 f6, 200(a2)
+ addu t0, t0, a1
+ swc1 f8, 204(a2)
+ swc1 f10, 208(a2)
+ swc1 f12, 212(a2)
+ swc1 f14, 216(a2)
+ swc1 f16, 220(a2)
+ /* row 7 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ swc1 f2, 224(a2)
+ swc1 f4, 228(a2)
+ swc1 f6, 232(a2)
+ swc1 f8, 236(a2)
+ swc1 f10, 240(a2)
+ swc1 f12, 244(a2)
+ swc1 f14, 248(a2)
+ swc1 f16, 252(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0 /* reads as 0 */
+#define AT $1 /* assembler temporary, see .set at / .set noat */
+#define v0 $2
+#define v1 $3
+#define a0 $4 /* a0-a3: the first four arguments */
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16 /* s0-s7: saved and restored by the routines that use them */
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29 /* stack pointer */
+#define fp $30 /* same register as s8 */
+#define s8 $30
+#define ra $31 /* return address */
+
+#define f0 $f0 /* f0-f31: floating-point registers */
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#ifdef __ELF__ /* ELF targets: mark the symbol with .hidden */
+#define HIDDEN_SYMBOL(symbol) .hidden symbol;
+#else /* other object formats: expands to nothing */
+#define HIDDEN_SYMBOL(symbol)
+#endif /* __ELF__ */
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2: emits the global function label (hidden on ELF), saves the assembler options with .set push,
+ * then selects arch = mips32r2, noreorder and noat; END() restores the options with .set pop. */
+#define LEAF_MIPS32R2(symbol) \
+ .globl symbol; \
+ HIDDEN_SYMBOL(symbol) \
+ .align 2; \
+ .type symbol, @function; \
+ .ent symbol, 0; \
+symbol: \
+ .frame sp, 0, ra; \
+ .set push; \
+ .set arch = mips32r2; \
+ .set noreorder; \
+ .set noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2: LEAF_MIPS32R2 followed by .set dspr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+ .set dspr2;
+
+/*
+ * END - mark end of function: restores the assembler options saved by the LEAF_* macro (.set pop), emits .end and records the symbol size
+ */
+#define END(function) \
+ .set pop; \
+ .end function; \
+ .size function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * registers to/from stack. Nominally the offset would have to cover the
+ * bytes needed for storing the registers (regs_num * 4), but since MIPS ABI
+ * allows usage of first 16 bytes of stack frame (this is reserved for input
+ * arguments of the functions, already stored in a0-a3), the macro only
+ * requires stack_offset >= regs_num * 4 - 16 and raises .error otherwise.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are subtracted from stack pointer
+ * (sp) before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, -\stack_offset /* make room before storing */
+.endif
+ sw \r1, 0(sp) /* r1 is mandatory; r2-r14 are stored only when supplied */
+.if \r2 != 0
+ sw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ sw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ sw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset /* from the 5th register on, the offset is validated */
+ sw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with SAVE_REGS_ON_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+ lw \r1, 0(sp) /* r1 is mandatory; r2-r14 are reloaded only when supplied */
+.if \r2 != 0
+ lw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ lw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ lw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset /* from the 5th register on, the offset is validated */
+ lw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, \stack_offset /* release the space after reloading */
+.endif
+.endm
diff --git a/media/libjpeg/simd/mips64/jccolext-mmi.c b/media/libjpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+/*
+ * mmA..mmH name the working vectors by position inside the packed pixel:
+ * mmA/mmB belong to the component stored at index 0, mmC/mmD to index 1,
+ * mmE/mmF to index 2 and mmG/mmH to index 3. Each pair aliases the
+ * even/odd variables (re/ro, ge/go, be/bo) of whichever of RGB_RED,
+ * RGB_GREEN or RGB_BLUE equals that index; an index that matches none of
+ * the three color components falls back to the spare pair xe/xo.
+ */
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+/*
+ * Convert num_rows rows of packed RGB pixels (RGB_PIXELSIZE bytes per pixel,
+ * component positions given by RGB_RED/RGB_GREEN/RGB_BLUE) to planar YCbCr.
+ *
+ * image_width: number of pixels converted in each row
+ * input_buf: source rows; one row pointer is consumed per row-loop pass
+ * output_buf: destination planes; Y is written to output_buf[0], Cb to
+ * output_buf[1] and Cr to output_buf[2]
+ * output_row: index of the first destination row in each plane
+ * num_rows: number of rows to convert
+ *
+ * Every pass of the column loop stores a full 8 bytes to each plane, even
+ * when fewer than 8 pixels remain, so each destination row presumably has
+ * room for image_width rounded up to a multiple of 8 -- TODO confirm against
+ * the buffer allocation done by the caller.
+ */
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+ __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+ __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+ __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+ /* One pass per row: fetch the source row and the three destination rows. */
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ /* Each pass handles 8 pixels and advances every output pointer by 8. */
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ /* Tail of the row: fewer than 8 pixels remain. col is the number of
+ * bytes left; the asm tests bits 1, 2, 4, 8 and 16 of that count and,
+ * for each bit that is set, issues a byte, halfword, word, one
+ * doubleword or two doubleword loads respectively. Operand %4
+ * (num_rows) is never referenced by the template. inptr is not
+ * advanced on this path. */
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ /* Full group of 8 pixels (24 bytes): use the aligned loads when inptr
+ * is 8-byte aligned, the unaligned variants otherwise, then step
+ * inptr past the group. */
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ /* Shuffle the three loaded doublewords so that mmA..mmF each end up
+ * holding one component of the even or odd pixels, in the layout given
+ * by the comment further down. NOTE(review): the lane behavior of
+ * _mm_loadlo_pi8_f/_mm_loadhi_pi8_f is defined in the MMI helper header,
+ * not in this file -- confirm there. */
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ /* Tail of the row: fewer than 8 pixels remain. col is the number of
+ * pixels left; the asm tests bits 1, 2 and 4 of that count and, for
+ * each bit that is set, loads one word, one doubleword or two
+ * doublewords respectively. Addresses are formed from inptr plus
+ * the remaining pixel count shifted left by 2. inptr is not advanced
+ * on this path. */
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ /* Full group of 8 pixels (32 bytes): aligned or unaligned loads
+ * depending on the low three bits of inptr, then step inptr past the
+ * group. */
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ /* Shuffle the four loaded doublewords so that mmA..mmH each end up
+ * holding one component of the even or odd pixels, in the layout given
+ * by the comment further down. NOTE(review): the lane behavior of the
+ * _mm_load{lo,hi}_pi8_f helpers is defined in the MMI helper header --
+ * confirm there. */
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
+
+ /* Odd pixels: R/G part of Y and R/G part of Cb. */
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+ cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+ cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+ /* Odd pixels: B term of Cb. NOTE(review): halving the result of
+ * _mm_load{lo,hi}_pi16_f presumably yields 0.5 * B in SCALEBITS fixed
+ * point -- confirm against the helper definition. */
+ blo = _mm_loadlo_pi16_f(bo);
+ bho = _mm_loadhi_pi16_f(bo);
+ halfblo = _mm_srli_pi32(blo, 1);
+ halfbho = _mm_srli_pi32(bho, 1);
+
+ /* Add the B term and the PD_ONEHALFM1_CJ constant, shift right by
+ * SCALEBITS and pack the 32-bit lanes down. */
+ cblo = _mm_add_pi32(cblo, halfblo);
+ cbho = _mm_add_pi32(cbho, halfbho);
+ cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+ cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+ cblo = _mm_srli_pi32(cblo, SCALEBITS);
+ cbho = _mm_srli_pi32(cbho, SCALEBITS);
+ cbo = _mm_packs_pi32(cblo, cbho);
+
+ /* Even pixels: same R/G products for Y and Cb. */
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+ cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+ cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+ /* Even pixels: B term of Cb, then finish Cb as for the odd pixels. */
+ ble = _mm_loadlo_pi16_f(be);
+ bhe = _mm_loadhi_pi16_f(be);
+ halfble = _mm_srli_pi32(ble, 1);
+ halfbhe = _mm_srli_pi32(bhe, 1);
+
+ cble = _mm_add_pi32(cble, halfble);
+ cbhe = _mm_add_pi32(cbhe, halfbhe);
+ cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+ cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+ cble = _mm_srli_pi32(cble, SCALEBITS);
+ cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+ cbe = _mm_packs_pi32(cble, cbhe);
+
+ /* Shift the odd-pixel results left by BYTE_BIT within each 16-bit lane
+ * and OR them with the even-pixel results to form the Cb output. */
+ cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+ cb = _mm_or_si64(cbe, cbo);
+
+ /* Odd pixels: B/G part of Y and B/G part of Cr. */
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+ crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+ crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+ /* Odd pixels: Y = (B/G part + R/G part + PD_ONEHALF) >> SCALEBITS. */
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ /* Odd pixels: R term of Cr, handled the same way as the B term of Cb. */
+ rlo = _mm_loadlo_pi16_f(ro);
+ rho = _mm_loadhi_pi16_f(ro);
+ halfrlo = _mm_srli_pi32(rlo, 1);
+ halfrho = _mm_srli_pi32(rho, 1);
+
+ crlo = _mm_add_pi32(crlo, halfrlo);
+ crho = _mm_add_pi32(crho, halfrho);
+ crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+ crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+ crlo = _mm_srli_pi32(crlo, SCALEBITS);
+ crho = _mm_srli_pi32(crho, SCALEBITS);
+ cro = _mm_packs_pi32(crlo, crho);
+
+ /* Even pixels: B/G part of Y and B/G part of Cr. */
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+ crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+ crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+ /* Even pixels: finish Y. */
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
+ /* Merge odd and even Y results the same way as Cb above. */
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ /* Even pixels: R term of Cr, then finish Cr. */
+ rle = _mm_loadlo_pi16_f(re);
+ rhe = _mm_loadhi_pi16_f(re);
+ halfrle = _mm_srli_pi32(rle, 1);
+ halfrhe = _mm_srli_pi32(rhe, 1);
+
+ crle = _mm_add_pi32(crle, halfrle);
+ crhe = _mm_add_pi32(crhe, halfrhe);
+ crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+ crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+ crle = _mm_srli_pi32(crle, SCALEBITS);
+ crhe = _mm_srli_pi32(crhe, SCALEBITS);
+ cre = _mm_packs_pi32(crle, crhe);
+
+ /* Merge odd and even Cr results. */
+ cro = _mm_slli_pi16(cro, BYTE_BIT);
+ cr = _mm_or_si64(cre, cro);
+
+ /* Store 8 bytes to each of the three planes. */
+ _mm_store_si64((__m64 *)&outptr0[0], y);
+ _mm_store_si64((__m64 *)&outptr1[0], cb);
+ _mm_store_si64((__m64 *)&outptr2[0], cr);
+ }
+ }
+}
+
+/* This file is included more than once; drop the aliases so that the next
+ * inclusion can define them again for a different pixel layout. */
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jccolor-mmi.c b/media/libjpeg/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+/* 16-bit fixed-point coefficients; each comment gives the real value that
+ * was scaled. */
+#define F_0_081 ((short)5329) /* FIX(0.08131) */
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_168 ((short)11059) /* FIX(0.16874) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+#define F_0_331 ((short)21709) /* FIX(0.33126) */
+#define F_0_418 ((short)27439) /* FIX(0.41869) */
+/* 38470 exceeds 32767, the largest value a 16-bit short can hold, so this
+ * one is kept as a plain int instead of relying on an out-of-range cast;
+ * only the difference F_0_337 below is narrowed to short. */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+/* Indices into const_value[]; the enumerators must stay in the same order
+ * as the initializers of that array. */
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250,
+ index_PW_MF016_MF033,
+ index_PW_MF008_MF041,
+ index_PD_ONEHALFM1_CJ
+};
+
+/* 64-bit SIMD constants, one per const_index enumerator.
+ * NOTE(review): the lane order produced by _uint64_set_pi16 and
+ * _uint64_set_pi32 is defined in the MMI helper header, not here --
+ * confirm there before reordering any argument. */
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+ _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+ _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+ _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+ ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+/* Read one table entry reinterpreted as an __m64 value. */
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+/* Named accessors used by the conversion code. */
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+/*
+ * Instantiate the converter once per pixel layout. jccolext-mmi.c acts as a
+ * template: every inclusion reads RGB_RED, RGB_GREEN, RGB_BLUE and
+ * RGB_PIXELSIZE and defines a function named jsimd_rgb_ycc_convert_mmi, so
+ * that name is #define'd to a layout-specific one before each inclusion
+ * except the first. The first inclusion uses whatever RGB_* values are
+ * already in effect (presumably the library defaults supplied by the
+ * included headers -- not visible in this file).
+ */
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgray-mmi.c b/media/libjpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+/* 16-bit fixed-point coefficients; each comment gives the real value that
+ * was scaled. */
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+/* 38470 exceeds 32767, the largest value a 16-bit short can hold, so this
+ * one is kept as a plain int instead of relying on an out-of-range cast;
+ * only the difference F_0_337 below is narrowed to short. */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+/* Indices into const_value[]; the enumerators must stay in the same order
+ * as the initializers of that array. */
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250
+};
+
+/* 64-bit SIMD constants, one per const_index enumerator.
+ * NOTE(review): the lane order produced by _uint64_set_pi16 and
+ * _uint64_set_pi32 is defined in the MMI helper header, not here --
+ * confirm there before reordering any argument. */
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+/* Read one table entry reinterpreted as an __m64 value. */
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+/* Named accessors used by the conversion code. */
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+
+
+/*
+ * Instantiate the converter once per pixel layout. jcgryext-mmi.c acts as a
+ * template: every inclusion reads RGB_RED, RGB_GREEN, RGB_BLUE and
+ * RGB_PIXELSIZE and defines a function named jsimd_rgb_gray_convert_mmi, so
+ * that name is #define'd to a layout-specific one before each inclusion
+ * except the first. The first inclusion uses whatever RGB_* values are
+ * already in effect (presumably the library defaults supplied by the
+ * included headers -- not visible in this file).
+ */
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgryext-mmi.c b/media/libjpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+/*
+ * mmA..mmH name the working vectors by position inside the packed pixel:
+ * mmA/mmB belong to the component stored at index 0, mmC/mmD to index 1,
+ * mmE/mmF to index 2 and mmG/mmH to index 3. Each pair aliases the
+ * even/odd variables (re/ro, ge/go, be/bo) of whichever of RGB_RED,
+ * RGB_GREEN or RGB_BLUE equals that index; an index that matches none of
+ * the three color components falls back to the spare pair xe/xo.
+ */
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+/*
+ * Convert num_rows rows of packed RGB pixels (RGB_PIXELSIZE bytes per pixel,
+ * component positions given by RGB_RED/RGB_GREEN/RGB_BLUE) to a single
+ * luminance (Y) plane.
+ *
+ * image_width: number of pixels converted in each row
+ * input_buf: source rows; one row pointer is consumed per row-loop pass
+ * output_buf: destination planes; only output_buf[0] is written
+ * output_row: index of the first destination row
+ * num_rows: number of rows to convert
+ *
+ * Every pass of the column loop stores a full 8 bytes, even when fewer than
+ * 8 pixels remain, so each destination row presumably has room for
+ * image_width rounded up to a multiple of 8 -- TODO confirm against the
+ * buffer allocation done by the caller.
+ */
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+ /* One pass per row: fetch the source row and the destination row. */
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ /* Each pass handles 8 pixels and advances the output pointer by 8. */
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ /* Tail of the row: fewer than 8 pixels remain. col is the number of
+ * bytes left; the asm tests bits 1, 2, 4, 8 and 16 of that count and,
+ * for each bit that is set, issues a byte, halfword, word, one
+ * doubleword or two doubleword loads respectively. Operand %4
+ * (num_rows) is never referenced by the template. inptr is not
+ * advanced on this path. */
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ /* Full group of 8 pixels (24 bytes): use the aligned loads when inptr
+ * is 8-byte aligned, the unaligned variants otherwise, then step
+ * inptr past the group. */
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ /* Shuffle the three loaded doublewords so that mmA..mmF each end up
+ * holding one component of the even or odd pixels, in the layout given
+ * by the comment further down. NOTE(review): the lane behavior of
+ * _mm_loadlo_pi8_f/_mm_loadhi_pi8_f is defined in the MMI helper header,
+ * not in this file -- confirm there. */
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ /* Tail of the row: fewer than 8 pixels remain. col is the number of
+ * pixels left; the asm tests bits 1, 2 and 4 of that count and, for
+ * each bit that is set, loads one word, one doubleword or two
+ * doublewords respectively. Addresses are formed from inptr plus
+ * the remaining pixel count shifted left by 2. inptr is not advanced
+ * on this path. */
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ /* Full group of 8 pixels (32 bytes): aligned or unaligned loads
+ * depending on the low three bits of inptr, then step inptr past the
+ * group. */
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ /* Shuffle the four loaded doublewords so that mmA..mmH each end up
+ * holding one component of the even or odd pixels, in the layout given
+ * by the comment further down. NOTE(review): the lane behavior of the
+ * _mm_load{lo,hi}_pi8_f helpers is defined in the MMI helper header --
+ * confirm there. */
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ */
+
+ /* Odd pixels: R/G part of Y. */
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+ /* Even pixels: R/G part of Y. */
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+ /* Odd pixels: B/G part of Y. */
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+ /* Odd pixels: Y = (B/G part + R/G part + PD_ONEHALF) >> SCALEBITS. */
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ /* Even pixels: B/G part of Y. */
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+ /* Even pixels: finish Y the same way as for the odd pixels. */
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
+ /* Shift the odd-pixel results left by BYTE_BIT within each 16-bit lane
+ * and OR them with the even-pixel results to form the Y output. */
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ /* Store 8 bytes to the luminance row. */
+ _mm_store_si64((__m64 *)&outptr[0], y);
+ }
+ }
+}
+
+/* This file is included more than once; drop the aliases so that the next
+ * inclusion can define them again for a different pixel layout. */
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jcsample-mmi.c b/media/libjpeg/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+/*
+ * 2:1 horizontal and 2:1 vertical downsampling of one component.
+ *
+ * The input rows are first padded on the right (expand_right_edge) up to
+ * 2 * width_in_blocks * DCTSIZE columns.  Every output sample is then the
+ * sum of a 2x2 group of input samples plus a rounding bias, shifted right by
+ * two.  The bias alternates 1, 2, 1, 2, ... across output columns.
+ *
+ * Each pass of the inner loop consumes 16 samples from each of two input
+ * rows and produces 8 output samples.
+ */
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int outrow, remaining;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW top, bottom, dst;
+  __m64 rounding, lobyte = 0.0;
+  __m64 lo_top, lo_bot, hi_top, hi_bot, lo_sum, hi_sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  /* All-ones compare result shifted down leaves the low byte of every
+   * 16-bit lane set: {0xFF 0x00 0xFF 0x00 ..}
+   */
+  lobyte = _mm_cmpeq_pi16(lobyte, lobyte);
+  lobyte = _mm_srli_pi16(lobyte, BYTE_BIT);
+
+  /* 0x00020001 in each 32-bit lane, i.e. {1, 2, 1, 2} as 16-bit lanes */
+  rounding = _mm_set1_pi32((1 << 17) + 1);
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    top = input_data[2 * outrow];
+    bottom = input_data[2 * outrow + 1];
+    dst = output_data[outrow];
+
+    remaining = output_cols;
+    while (remaining > 0) {
+      /* First and second group of 8 samples from both rows */
+      lo_top = _mm_load_si64((__m64 *)&top[0]);
+      lo_bot = _mm_load_si64((__m64 *)&bottom[0]);
+      hi_top = _mm_load_si64((__m64 *)&top[8]);
+      hi_bot = _mm_load_si64((__m64 *)&bottom[8]);
+
+      /* Horizontal pair sums: low byte + high byte of every 16-bit lane */
+      lo_top = _mm_add_pi16(_mm_and_si64(lo_top, lobyte),
+                            _mm_srli_pi16(lo_top, BYTE_BIT));
+      lo_bot = _mm_add_pi16(_mm_and_si64(lo_bot, lobyte),
+                            _mm_srli_pi16(lo_bot, BYTE_BIT));
+      hi_top = _mm_add_pi16(_mm_and_si64(hi_top, lobyte),
+                            _mm_srli_pi16(hi_top, BYTE_BIT));
+      hi_bot = _mm_add_pi16(_mm_and_si64(hi_bot, lobyte),
+                            _mm_srli_pi16(hi_bot, BYTE_BIT));
+
+      /* Vertical sum, rounding bias, divide by four */
+      lo_sum = _mm_add_pi16(lo_top, lo_bot);
+      lo_sum = _mm_add_pi16(lo_sum, rounding);
+      lo_sum = _mm_srli_pi16(lo_sum, 2);
+
+      hi_sum = _mm_add_pi16(hi_top, hi_bot);
+      hi_sum = _mm_add_pi16(hi_sum, rounding);
+      hi_sum = _mm_srli_pi16(hi_sum, 2);
+
+      /* Narrow the two groups of four words to eight output bytes */
+      _mm_store_si64((__m64 *)&dst[0], _mm_packs_pu16(lo_sum, hi_sum));
+
+      remaining -= 8;
+      top += 16;
+      bottom += 16;
+      dst += 8;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jcsample.h b/media/libjpeg/simd/mips64/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/*
+ * Pad the first num_rows rows of image_data on the right, from column
+ * input_cols up to (but not including) column output_cols, by repeating the
+ * sample found at column input_cols - 1.  Nothing is written when
+ * output_cols does not exceed input_cols.
+ *
+ * The fill value is read from column input_cols - 1, so input_cols must be
+ * at least 1 whenever padding is required.
+ */
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  int pad = (int)(output_cols - input_cols);
+  int r;
+
+  if (pad <= 0)
+    return;
+
+  for (r = 0; r < num_rows; r++) {
+    JSAMPROW dst = image_data[r] + input_cols;
+    JSAMPROW end = dst + pad;
+    JSAMPLE fill = dst[-1];
+
+    while (dst != end)
+      *dst++ = fill;
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jdcolext-mmi.c b/media/libjpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+/* Bind the generic names mmA..mmH to the color vectors used by the function
+ * below, according to the byte position of each component inside an output
+ * pixel:
+ *
+ *   byte 0 -> mmA (even-numbered pixels) / mmB (odd-numbered pixels)
+ *   byte 1 -> mmC / mmD
+ *   byte 2 -> mmE / mmF
+ *   byte 3 -> mmG / mmH
+ *
+ * re/ro, ge/go and be/bo hold red, green and blue.  A byte position that is
+ * not claimed by RGB_RED, RGB_GREEN or RGB_BLUE is bound to the filler
+ * vectors xe/xo.
+ */
+
+/* Byte 0 of each pixel */
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+/* Byte 1 of each pixel */
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+/* Byte 2 of each pixel */
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+/* Byte 3 of each pixel (only meaningful for 4-byte pixel formats) */
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+/*
+ * Convert num_rows rows of planar Y/Cb/Cr samples to interleaved RGB.
+ *
+ * input_buf[0], input_buf[1] and input_buf[2] supply the Y, Cb and Cr rows,
+ * starting at row index input_row; one output row pointer is consumed from
+ * output_buf for every input row.  Each pass of the inner loop loads 8
+ * samples per component and produces 8 pixels of RGB_PIXELSIZE bytes.  When
+ * fewer than 8 pixels remain in a row, the inline assembly writes the result
+ * in progressively smaller pieces so that only the remaining pixels are
+ * stored.
+ *
+ * The position of each color within a pixel is fixed at compile time through
+ * the mmA..mmH aliases, which are derived from RGB_RED/RGB_GREEN/RGB_BLUE.
+ */
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+ __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask;
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+
+ /* num_cols counts the pixels still to be produced in this row; outptr is
+ * advanced inside the body, and only after a full group of 8 pixels.
+ */
+ for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+ inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ /* Build two constants from an all-ones compare result: a low-byte mask
+ * for every 16-bit lane, and 0xFF80 (-128), which is added to the chroma
+ * samples below to remove their +128 offset.
+ */
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ /* Split the 8 chroma bytes into even- and odd-numbered samples, each
+ * widened to 16 bits, then subtract 128.
+ */
+ cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
+ cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
+ cre = _mm_and_si64(mask, cr); /* Cr(0246) */
+ cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */
+ cbe = _mm_add_pi16(cbe, decenter);
+ cbo = _mm_add_pi16(cbo, decenter);
+ cre = _mm_add_pi16(cre, decenter);
+ cro = _mm_add_pi16(cro, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+
+ /* The chroma values are doubled before the high-half multiply and the
+ * product is halved again (with rounding via PW_ONE) afterwards.
+ */
+ cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
+ cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
+ cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
+ cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
+
+ be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800)) */
+ bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800)) */
+ re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
+ ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, PW_ONE);
+ bo = _mm_add_pi16(bo, PW_ONE);
+ be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */
+ bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */
+ re = _mm_add_pi16(re, PW_ONE);
+ ro = _mm_add_pi16(ro, PW_ONE);
+ re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */
+ ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, cbe);
+ bo = _mm_add_pi16(bo, cbo);
+ be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
+ bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
+ re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
+ ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
+
+ /* Green: interleave Cb and Cr words so that one multiply-add per pair
+ * yields Cb * -0.344 + Cr * 0.285 as a 32-bit value, then round
+ * (PD_ONEHALF), shift by SCALEBITS and narrow back to 16 bits.
+ */
+ gle = _mm_unpacklo_pi16(cbe, cre);
+ ghe = _mm_unpackhi_pi16(cbe, cre);
+ gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+ ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+ glo = _mm_unpacklo_pi16(cbo, cro);
+ gho = _mm_unpackhi_pi16(cbo, cro);
+ glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+ gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+ gle = _mm_add_pi32(gle, PD_ONEHALF);
+ ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+ gle = _mm_srai_pi32(gle, SCALEBITS);
+ ghe = _mm_srai_pi32(ghe, SCALEBITS);
+ glo = _mm_add_pi32(glo, PD_ONEHALF);
+ gho = _mm_add_pi32(gho, PD_ONEHALF);
+ glo = _mm_srai_pi32(glo, SCALEBITS);
+ gho = _mm_srai_pi32(gho, SCALEBITS);
+
+ ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+ go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+ ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+ go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+ ye = _mm_and_si64(mask, y); /* Y(0246) */
+ yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
+
+ /* Add luma to each color difference and narrow to bytes with unsigned
+ * saturation; only the low four bytes of each result are used below.
+ */
+ re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+ ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+ re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
+ ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
+
+ ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+ go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+ ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
+ go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
+
+ be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+ bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+ be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
+ bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* In the lane diagrams below, "cp" means byte c of pixel p.  The shuffle
+ * leaves the 24 output bytes in mmA (bytes 0-7), mmE (bytes 8-15) and mmC
+ * (bytes 16-23).
+ */
+
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+
+ mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+ mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
+
+ mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+ mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
+
+ mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
+ mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
+
+ mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
+ mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
+
+ mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
+ mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
+
+ if (num_cols >= 8) {
+ /* Full group: write all 24 bytes, using the aligned store only when
+ * outptr is a multiple of 8.
+ */
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmE);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ /* Partial group: col is the number of BYTES still to be written (at
+ * most 21).  The assembly writes them as 16, 8, 4, 2 and 1 byte pieces.
+ * Register use: $8 = size threshold, $9 = bytes remaining,
+ * $10 = destination, $f4 = the 8 bytes to store next, $f6 = the second
+ * 8 bytes (mmE), $11 = low 32 bits of $f4 for the sub-8-byte stores.
+ *
+ * NOTE(review): ".set noreorder" is not paired with ".set reorder" or
+ * ".set push"/".set pop" in this statement -- confirm that leaving the
+ * assembler in noreorder mode is harmless for the code that follows.
+ */
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 16\r\n"
+ "move $9, %4\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %3\r\n"
+ "move $10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ /* >= 16 bytes: store mmA and mmE (each gssdlc1/gssdrc1 pair is an
+ * unaligned 8-byte store), continue with mmC, skip the 8-byte step */
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %2\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+ "b 2f\r\n"
+ "nop \r\n"
+
+ "1: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ /* >= 8 bytes: store mmA, continue with mmE */
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ /* >= 4 bytes: store the low word (swl/swr = unaligned), then move
+ * the high word of $f4 down into $11 */
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "3: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ /* >= 2 bytes: store a halfword, keep the next byte in $11 */
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "4: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "5: \r\n"
+ "nop \r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ /* The filler byte of every pixel is all ones when RGBX_FILLER_0XFF is
+ * defined and zero otherwise.
+ */
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+ /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+
+ mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
+ mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
+ mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
+
+ mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
+ mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
+ mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
+
+ if (num_cols >= 8) {
+ /* Full group: pixels 0-1, 2-3, 4-5 and 6-7 are in mmA, mmD, mmC and
+ * mmH; the aligned store is used only when outptr is a multiple of 8.
+ */
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmD);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ /* Partial group: here col counts PIXELS (at most 7), written as groups
+ * of 4, 2 and 1 pixels.  Register use: $8 = size threshold,
+ * $9 = pixels remaining, $10 = destination, $f4/$f6 = the next two
+ * 8-byte pieces.  The final "li %1, 0" sets col to zero.
+ *
+ * NOTE(review): ".set noreorder" is not paired with ".set reorder" or
+ * ".set push"/".set pop" in this statement -- confirm that leaving the
+ * assembler in noreorder mode is harmless for the code that follows.
+ */
+ col = num_cols;
+ asm(".set noreorder\r\n" /* st16 */
+
+ "li $8, 4\r\n"
+ "move $9, %6\r\n"
+ "move $10, %7\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %4\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ /* >= 4 pixels: store mmA and mmD, continue with mmC and mmH */
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "mov.s $f6, %5\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ /* >= 2 pixels: store $f4, then promote $f6 */
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ /* last pixel: store the low 4 bytes of $f4 */
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "3: \r\n"
+ "li %1, 0\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+ "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+ }
+}
+
+/* This file is included by jdcolor-mmi.c once per pixel format; remove the
+ * layout-specific aliases so that the next inclusion can define them afresh.
+ */
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdcolor-mmi.c b/media/libjpeg/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+/* Fractional parts of the YCC->RGB coefficients as 16-bit fixed-point values
+ * scaled by 2^16 (for example 26345 ~= 0.402 * 65536).
+ */
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
+
+/* Positions of the packed constants in const_value[] below; the order of the
+ * enumerators must match the order of the initializers.
+ */
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+/* 64-bit packed constants used by the conversion code in jdcolext-mmi.c:
+ * PW_ONE rounding term added before the arithmetic shift by 1
+ * PW_F0402 multiplier for the Cr contribution to red
+ * PW_MF0228 multiplier for the Cb contribution to blue
+ * PW_MF0344_F0285 -0.344/0.285 weight pairs for the multiply-add on
+ * interleaved Cb/Cr words (green)
+ * PD_ONEHALF rounding term added before the shift by SCALEBITS
+ */
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+/* Accessors for the table entries.  get_const_value() is not defined in this
+ * file -- presumably it comes from jsimd_mmi.h and reads const_value[index].
+ */
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+/* Makes the 4-byte pixel code in jdcolext-mmi.c fill the unused byte of each
+ * pixel with 0xFF rather than 0.
+ */
+#define RGBX_FILLER_0XFF 1
+
+
+/* jdcolext-mmi.c is a template: every inclusion emits one conversion function
+ * for the pixel layout described by RGB_RED, RGB_GREEN, RGB_BLUE and
+ * RGB_PIXELSIZE.
+ *
+ * First inclusion: uses whatever values of those four macros are already in
+ * effect (they are not defined in this file -- presumably the library
+ * defaults reached through jsimd_mmi.h) and keeps the name
+ * jsimd_ycc_rgb_convert_mmi.
+ */
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+/* Each stanza below rebinds the four layout macros to one EXT_* format,
+ * renames the generated function through a macro, includes the template, and
+ * then removes all five definitions again.
+ */
+
+/* EXT_RGB */
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+/* EXT_RGBX */
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+/* EXT_BGR */
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+/* EXT_BGRX */
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+/* EXT_XBGR */
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+/* EXT_XRGB */
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jdmerge-mmi.c b/media/libjpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+/* Fractional parts of the YCC->RGB coefficients as 16-bit fixed-point values
+ * scaled by 2^16 (for example 26345 ~= 0.402 * 65536).
+ */
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
+
+/* Positions of the packed constants in const_value[] below; the order of the
+ * enumerators must match the order of the initializers.
+ */
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+/* 64-bit packed constants, one per enumerator above: four 16-bit lanes for
+ * the PW_* entries and two 32-bit lanes holding 1 << (SCALEBITS - 1) for
+ * PD_ONEHALF.  They are consumed by the code in jdmrgext-mmi.c.
+ */
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+/* Accessors for the table entries.  get_const_value() is not defined in this
+ * file -- presumably it comes from jsimd_mmi.h and reads const_value[index].
+ */
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+/* NOTE(review): presumably selects 0xFF (rather than 0) for the unused byte
+ * of 4-byte pixels inside jdmrgext-mmi.c -- confirm against that file.
+ */
+#define RGBX_FILLER_0XFF 1
+
+
+/* jdmrgext-mmi.c is included once per pixel layout; the layout is described
+ * by RGB_RED, RGB_GREEN, RGB_BLUE and RGB_PIXELSIZE.
+ *
+ * First inclusion: uses whatever values of those four macros are already in
+ * effect (they are not defined in this file -- presumably the library
+ * defaults reached through jsimd_mmi.h) and keeps the names
+ * jsimd_h2v1_merged_upsample_mmi and jsimd_h2v2_merged_upsample_mmi.
+ */
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+/* Each stanza below rebinds the four layout macros to one EXT_* format,
+ * renames both function names through macros, includes the template, and
+ * then removes all six definitions again.
+ */
+
+/* EXT_RGB */
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+/* EXT_RGBX */
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+/* EXT_BGR */
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+/* EXT_BGRX */
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+/* EXT_XBGR */
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+/* EXT_XRGB */
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/media/libjpeg/simd/mips64/jdmrgext-mmi.c b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0 /* mmA/mmB: even-/odd-pixel vectors of the component stored at byte 0 of a pixel */
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else /* no colour component at this byte: use the filler vectors */
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1 /* mmC/mmD: same, for byte 1 of a pixel */
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else /* no colour component at this byte: use the filler vectors */
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2 /* mmE/mmF: same, for byte 2 of a pixel */
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else /* no colour component at this byte: use the filler vectors */
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3 /* mmG/mmH: same, for byte 3; in the 3-byte path they are reused as shuffle temporaries */
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else /* no colour component at this byte: use the filler vectors */
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, /* pixels per output row */
+ JSAMPIMAGE input_buf, /* [0]=Y, [1]=Cb, [2]=Cr row arrays */
+ JDIMENSION in_row_group_ctr, /* row index read from each of the three planes */
+ JSAMPARRAY output_buf) /* output_buf[0] receives the packed pixels */
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+ __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+ __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+ __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+ __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+ __m64 mm8, mm9;
+#endif
+ /* H2V1 merged upsampling + YCbCr->RGB: every Cb/Cr sample colours two adjacent Y samples. */
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+ inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+ /* One pass: 8 Cb + 8 Cr + 16 Y samples in, up to 16 pixels out. */
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ ythis = _mm_load_si64((__m64 *)inptr0);
+ ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ crh = _mm_unpackhi_pi8(cr, zero); /* Cr(4567) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ cbh = _mm_add_pi16(cbh, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+ crh = _mm_add_pi16(crh, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
+
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+ rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bh = _mm_add_pi16(bh, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ bh = _mm_srai_pi16(bh, 1); /* (CbH * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rh = _mm_add_pi16(rh, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+ rh = _mm_srai_pi16(rh, 1); /* (CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bh = _mm_add_pi16(bh, cbh);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+ rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
+
+ ga = _mm_unpacklo_pi16(cbl, crl);
+ gb = _mm_unpackhi_pi16(cbl, crl);
+ ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+ gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+ gc = _mm_unpacklo_pi16(cbh, crh);
+ gd = _mm_unpackhi_pi16(cbh, crh);
+ gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+ gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+ ga = _mm_add_pi32(ga, PD_ONEHALF);
+ gb = _mm_add_pi32(gb, PD_ONEHALF);
+ ga = _mm_srai_pi32(ga, SCALEBITS);
+ gb = _mm_srai_pi32(gb, SCALEBITS);
+ gc = _mm_add_pi32(gc, PD_ONEHALF);
+ gd = _mm_add_pi32(gd, PD_ONEHALF);
+ gc = _mm_srai_pi32(gc, SCALEBITS);
+ gd = _mm_srai_pi32(gd, SCALEBITS);
+
+ gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+ gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+ ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
+ ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
+ ynexte = _mm_and_si64(mask, ynext); /* Y(8ACE) */
+ ynexto = _mm_srli_pi16(ynext, BYTE_BIT); /* Y(9BDF) */
+
+ rle = _mm_add_pi16(rl, ythise); /* (R0 R2 R4 R6) */
+ rlo = _mm_add_pi16(rl, ythiso); /* (R1 R3 R5 R7) */
+ rhe = _mm_add_pi16(rh, ynexte); /* (R8 RA RC RE) */
+ rho = _mm_add_pi16(rh, ynexto); /* (R9 RB RD RF) */
+ re = _mm_packs_pu16(rle, rhe); /* (R0 R2 R4 R6 R8 RA RC RE) */
+ ro = _mm_packs_pu16(rlo, rho); /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+ gle = _mm_add_pi16(gl, ythise); /* (G0 G2 G4 G6) */
+ glo = _mm_add_pi16(gl, ythiso); /* (G1 G3 G5 G7) */
+ ghe = _mm_add_pi16(gh, ynexte); /* (G8 GA GC GE) */
+ gho = _mm_add_pi16(gh, ynexto); /* (G9 GB GD GF) */
+ ge = _mm_packs_pu16(gle, ghe); /* (G0 G2 G4 G6 G8 GA GC GE) */
+ go = _mm_packs_pu16(glo, gho); /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+ ble = _mm_add_pi16(bl, ythise); /* (B0 B2 B4 B6) */
+ blo = _mm_add_pi16(bl, ythiso); /* (B1 B3 B5 B7) */
+ bhe = _mm_add_pi16(bh, ynexte); /* (B8 BA BC BE) */
+ bho = _mm_add_pi16(bh, ynexto); /* (B9 BB BD BF) */
+ be = _mm_packs_pu16(ble, bhe); /* (B0 B2 B4 B6 B8 BA BC BE) */
+ bo = _mm_packs_pu16(blo, bho); /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
+ mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+ mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+ mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
+ mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
+ mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
+ mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
+ mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
+ mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
+
+ mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
+ mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+ mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
+ mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
+ mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
+ mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
+ mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
+
+ mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
+ mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
+ mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+ mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
+ mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmE);
+ _mm_store_si64((__m64 *)(outptr + 32), mmF);
+ _mm_store_si64((__m64 *)(outptr + 40), mmG);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 6 + 3;
+ else
+ col = num_cols * 6;
+ /* Tail: write only the first `col` bytes, in 24/16/8/4/2/1-byte steps. */
+ asm(".set push\r\n" ".set noreorder\r\n" /* st24; push/pop keeps noreorder from leaking out */
+
+ "li $8, 24\r\n"
+ "move $9, %7\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %2\r\n"
+ "mov.s $f8, %3\r\n"
+ "move $10, %8\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "mov.s $f4, %4\r\n"
+ "mov.s $f6, %5\r\n"
+ "mov.s $f8, %6\r\n"
+ "subu $9, $9, 24\r\n"
+ PTR_ADDU "$10, $10, 24\r\n"
+
+ "1: \r\n"
+ "li $8, 16\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "4: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "5: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 6f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "6: \r\n"
+ "nop \r\n" ".set pop\r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+ "f" (mmG), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+ mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+ mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
+ mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+ mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+ mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
+ mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
+ mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
+
+ mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+ mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
+ mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
+ mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+ mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
+ mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
+ mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
+
+ mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+ mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+ mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+ mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmD);
+ _mm_store_si64((__m64 *)(outptr + 32), mmE);
+ _mm_store_si64((__m64 *)(outptr + 40), mmF);
+ _mm_store_si64((__m64 *)(outptr + 48), mmG);
+ _mm_store_si64((__m64 *)(outptr + 56), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+ _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 2 + 1;
+ else
+ col = num_cols * 2;
+ asm(".set push\r\n" ".set noreorder\r\n" /* st32; col counts pixels here, stored in 8/4/2/1-pixel steps */
+
+ "li $8, 8\r\n"
+ "move $9, %10\r\n"
+ "move $10, %11\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %3\r\n"
+ "mov.s $f8, %4\r\n"
+ "mov.s $f10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "gssdlc1 $f10, 7+24($10)\r\n"
+ "gssdrc1 $f10, 24($10)\r\n"
+ "mov.s $f4, %6\r\n"
+ "mov.s $f6, %7\r\n"
+ "mov.s $f8, %8\r\n"
+ "mov.s $f10, %9\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 32\r\n"
+
+ "1: \r\n"
+ "li $8, 4\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "mov.s $f6, $f10\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "4: \r\n"
+ "li %1, 0\r\n" ".set pop\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+ "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+ /* Odd width whose paired part filled whole passes: the tail store never ran, so emit the last pixel here. */
+ if (!((output_width >> 1) & 7)) {
+ if (output_width & 1) {
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ decenter = 0.0;
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+
+ gl = _mm_unpacklo_pi16(cbl, crl);
+ gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+ gl = _mm_add_pi32(gl, PD_ONEHALF);
+ gl = _mm_srai_pi32(gl, SCALEBITS);
+ gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+ yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
+ rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */
+ gl = _mm_add_pi16(gl, yl); /* (G0 G1 G2 G3) */
+ bl = _mm_add_pi16(bl, yl); /* (B0 B1 B2 B3) */
+ re = _mm_packs_pu16(rl, rl);
+ ge = _mm_packs_pu16(gl, gl);
+ be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set push\r\n" ".set noreorder\r\n" /* store the low 3 bytes of mmA */
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "mfc1 $9, $f4\r\n"
+ "ush $9, 0($8)\r\n"
+ "srl $9, 16\r\n"
+ "sb $9, 2($8)\r\n" ".set pop\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "$9", "memory"
+ );
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+#else
+ xe = _mm_xor_si64(xe, xe);
+#endif
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set push\r\n" ".set noreorder\r\n" /* store the low 4 bytes of mmA */
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "gsswlc1 $f4, 3($8)\r\n"
+ "gsswrc1 $f4, 0($8)\r\n" ".set pop\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "memory"
+ );
+#endif
+ }
+ }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ /* Run the h2v1 kernel twice: once per luma row that shares this chroma row. */
+ JSAMPARRAY y_rows = input_buf[0];
+ JSAMPROW *y_slot = &y_rows[in_row_group_ctr];
+ JSAMPROW saved_y = *y_slot, saved_out = output_buf[0];
+ int pass;
+
+ for (pass = 0; pass < 2; pass++) {
+ /* The kernel reads Y through slot in_row_group_ctr and writes output_buf[0],
+ * so point both slots at the rows wanted for this pass. */
+ *y_slot = y_rows[in_row_group_ctr * 2 + pass];
+ output_buf[0] = output_buf[pass];
+ jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+ }
+ *y_slot = saved_y; /* undo the temporary aliasing before returning */
+ output_buf[0] = saved_out;
+}
+
+
+#undef mmA /* drop the per-format aliases so the next inclusion of this file can redefine them */
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdsample-mmi.c b/media/libjpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index { /* positions of the packed-word constants inside const_value[] */
+ index_PW_ONE,
+ index_PW_TWO,
+ index_PW_THREE,
+ index_PW_SEVEN,
+ index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = { /* entries must stay in const_index order */
+ _uint64_set_pi16(1, 1, 1, 1), /* index_PW_ONE */
+ _uint64_set_pi16(2, 2, 2, 2), /* index_PW_TWO */
+ _uint64_set_pi16(3, 3, 3, 3), /* index_PW_THREE */
+ _uint64_set_pi16(7, 7, 7, 7), /* index_PW_SEVEN */
+ _uint64_set_pi16(8, 8, 8, 8), /* index_PW_EIGHT */
+};
+
+#define PW_ONE get_const_value(index_PW_ONE) /* h2v1 rounding bias; get_const_value() is defined elsewhere, presumably reads const_value[] */
+#define PW_TWO get_const_value(index_PW_TWO) /* h2v1 rounding bias */
+#define PW_THREE get_const_value(index_PW_THREE) /* weight applied to the nearer sample */
+#define PW_SEVEN get_const_value(index_PW_SEVEN) /* h2v2 rounding bias */
+#define PW_EIGHT get_const_value(index_PW_EIGHT) /* h2v2 rounding bias */
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { /* uses samp0123, samp4567, wk[] and outptr<row> from the caller */ \
+ __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+ __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+ __m64 outle, outhe, outlo, outho, outl, outh; \
+ /* Shift by one 16-bit lane each way to build the neighbour vectors. */ \
+ samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \
+ sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \
+ samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \
+ sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \
+ \
+ samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \
+ samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \
+ \
+ sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \
+ samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \
+ samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \
+ /* wk[row] supplies the sample left of this block, wk[row + wkoffset] the one right of it. */ \
+ samp_1012 = _mm_or_si64(sampX012, wk[row]); /* (-1 0 1 2) */ \
+ samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]); /* ( 5 6 7 8) */ \
+ /* Keep unscaled sample 7 as the left neighbour for the next block. */ \
+ wk[row] = samp7XXX; \
+ /* even out = (3*s[i] + s[i-1] + bias1) >> shift; odd out = (3*s[i] + s[i+1] + bias2) >> shift */ \
+ samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+ samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+ samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+ samp3456 = _mm_add_pi16(samp3456, bias1); \
+ samp1234 = _mm_add_pi16(samp1234, bias2); \
+ samp5678 = _mm_add_pi16(samp5678, bias2); \
+ \
+ outle = _mm_add_pi16(samp_1012, samp0123); \
+ outhe = _mm_add_pi16(samp3456, samp4567); \
+ outle = _mm_srli_pi16(outle, shift); /* ( 0 2 4 6) */ \
+ outhe = _mm_srli_pi16(outhe, shift); /* ( 8 10 12 14) */ \
+ outlo = _mm_add_pi16(samp1234, samp0123); \
+ outho = _mm_add_pi16(samp5678, samp4567); \
+ outlo = _mm_srli_pi16(outlo, shift); /* ( 1 3 5 7) */ \
+ outho = _mm_srli_pi16(outho, shift); /* ( 9 11 13 15) */ \
+ /* Move odd results into the high byte of each word and merge with the even ones. */ \
+ outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+ outho = _mm_slli_pi16(outho, BYTE_BIT); \
+ outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \
+ outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \
+ /* 16 output bytes per call */ \
+ _mm_store_si64((__m64 *)outptr##row, outl); \
+ _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, /* number of output rows to produce */
+ JDIMENSION downsampled_width, /* samples per input row */
+ JSAMPARRAY input_data, /* rows inrow-1 and inrow+1 are read too */
+ JSAMPARRAY *output_data_ptr) /* points at the output row array */
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol, tmp, tmp1;
+ __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+ __m64 this0l, this0h, this0;
+ __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+ __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+ __m64 next0l, next0h, next0;
+ __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ /* Each input row yields two output rows: one blended with the row above, one with the row below. */
+ inptr_1 = input_data[inrow - 1];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ /* Partial last block: copy each row's final sample one byte past its end to act as the right neighbour. */
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm volatile(PTR_ADDU "$8, %3, %6\r\n" /* volatile + "memory": the bytes touched are not the listed operands */
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %3, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %7\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+ : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+ : "$8", "$9", "memory"
+ );
+ }
+
+ /* process the first column block */
+ this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */
+ this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */
+
+ this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */
+ this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */
+ this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */
+ this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */
+ this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */
+ this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */
+
+ this0l = _mm_mullo_pi16(this0l, PW_THREE);
+ this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+ thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */
+ thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */
+ thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */
+ thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+ _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+ wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */
+ wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+ outptr0 += 16, outptr1 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */
+ next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */
+
+ next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */
+ next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */
+ next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */
+ next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */
+ next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */
+ next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */
+
+ next0l = _mm_mullo_pi16(next0l, PW_THREE);
+ next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+ nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */
+ nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */
+ nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */
+ nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+ _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+ wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 colsum; /* renamed: used to shadow the outer int tmp */
+
+ /* process the last column block */
+ colsum = _mm_load_si64((__m64 *)outptr0 + 1);
+ wk[2] = _mm_and_si64(masklast, colsum); /* ( - - - 7) */
+ colsum = _mm_load_si64((__m64 *)outptr1 + 1);
+ wk[3] = _mm_and_si64(masklast, colsum); /* ( - - - 7) */
+ }
+
+ /* process the upper row */
+ samp0123 = _mm_load_si64((__m64 *)outptr0); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr0 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+ /* process the lower row */
+ samp0123 = _mm_load_si64((__m64 *)outptr1); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr1 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+ }
+ }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor, /* number of rows to process */
+ JDIMENSION downsampled_width, /* samples per input row */
+ JSAMPARRAY input_data, /* input rows */
+ JSAMPARRAY *output_data_ptr) /* points at the output row array */
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, outptr0;
+ int inrow, incol, tmp, tmp1;
+ __m64 thisl, this, nextl, next;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ /* Horizontal-only 2x upsampling: every input sample becomes two output samples. */
+ inptr0 = input_data[inrow];
+ outptr0 = output_data[inrow];
+ /* Partial last block: copy the final sample one byte past the row end to act as the right neighbour. */
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm volatile(PTR_ADDU "$8, %1, %2\r\n" /* volatile + "memory": the bytes touched are not the listed operand */
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %1, %3\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr0)
+ : "r" (inptr0), "r" (tmp), "r" (tmp1)
+ : "$8", "$9", "memory"
+ );
+ }
+
+ /* process the first column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thisl = _mm_unpacklo_pi8(this, zero); /* row[ 0][0]( 0 1 2 3) */
+ wk[0] = _mm_and_si64(thisl, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ nextl = _mm_unpacklo_pi8(next, zero); /* row[ 0][1]( 0 1 2 3) */
+ wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 thish;
+
+ /* process the last column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thish = _mm_unpackhi_pi8(this, zero); /* row[ 0][0]( 4 5 6 7) */
+ wk[1] = _mm_and_si64(masklast, thish); /* ( - - - 7) */
+ }
+
+ /* process the row */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ samp0123 = _mm_unpacklo_pi8(this, zero); /* ( 0 1 2 3) */
+ samp4567 = _mm_unpackhi_pi8(this, zero); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+ }
+ }
+}
diff --git a/media/libjpeg/simd/mips64/jfdctfst-mmi.c b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8 /* the F_* multipliers below carry 8 fractional bits (value * 256, rounded) */
+
+#define F_0_382 ((short)98) /* FIX(0.382683433) */
+#define F_0_541 ((short)139) /* FIX(0.541196100) */
+#define F_0_707 ((short)181) /* FIX(0.707106781) */
+#define F_1_306 ((short)334) /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS 2 /* only feeds CONST_SHIFT */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) /* NOTE(review): not referenced anywhere else in this file */
+
+enum const_index { /* slot numbers of const_value[]; the PW_* macros pass them to get_const_value() */
+ index_PW_F0707,
+ index_PW_F0382,
+ index_PW_F0541,
+ index_PW_F1306
+};
+
+static uint64_t const_value[] = { /* one 64-bit word per enum const_index entry, in the same order */
+ _uint64_set1_pi16(F_0_707), /* index_PW_F0707 */
+ _uint64_set1_pi16(F_0_382), /* index_PW_F0382 */
+ _uint64_set1_pi16(F_0_541), /* index_PW_F0541 */
+ _uint64_set1_pi16(F_1_306) /* index_PW_F1306 */
+};
+/* Shorthand accessors; get_const_value() is not defined in this file (presumably it reads const_value[index]). */
+#define PW_F0707 get_const_value(index_PW_F0707)
+#define PW_F0382 get_const_value(index_PW_F0382)
+#define PW_F0541 get_const_value(index_PW_F0541)
+#define PW_F1306 get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+ __m64 mulhi, mullo, mul12, mul34; \
+ /* out = (in * multiplier) >> CONST_BITS per 16-bit lane, assuming the _mm_* helpers mirror the x86 MMX intrinsics of the same names; out may be the same variable as in */ \
+ mullo = _mm_mullo_pi16(in, multiplier); /* low halves of the products */ \
+ mulhi = _mm_mulhi_pi16(in, multiplier); /* high halves of the products */ \
+ mul12 = _mm_unpacklo_pi16(mullo, mulhi); /* 32-bit products of lanes 0-1 */ \
+ mul34 = _mm_unpackhi_pi16(mullo, mulhi); /* 32-bit products of lanes 2-3 */ \
+ mul12 = _mm_srai_pi32(mul12, CONST_BITS); /* drop the fractional bits of the multiplier */ \
+ mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+ out = _mm_packs_pi32(mul12, mul34); /* back to four 16-bit lanes; out is written only here */ \
+}
+
+#define DO_FDCT_COMMON() { \
+ /* 1-D transform shared by both passes: reads tmp0..tmp7 and writes out0..out7; every name used here is declared by the function that expands the macro */ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); \
+ out4 = _mm_sub_pi16(tmp10, tmp11); \
+ \
+ z1 = _mm_add_pi16(tmp12, tmp13); \
+ DO_FDCT_MULTIPLY(z1, z1, PW_F0707) /* z1 = (tmp12 + tmp13) * 0.707106781 */ \
+ \
+ out2 = _mm_add_pi16(tmp13, z1); \
+ out6 = _mm_sub_pi16(tmp13, z1); \
+ \
+ /* Odd part (tmp10..tmp12 are reused for new sums from here on) */ \
+ \
+ tmp10 = _mm_add_pi16(tmp4, tmp5); \
+ tmp11 = _mm_add_pi16(tmp5, tmp6); \
+ tmp12 = _mm_add_pi16(tmp6, tmp7); \
+ \
+ z5 = _mm_sub_pi16(tmp10, tmp12); \
+ DO_FDCT_MULTIPLY(z5, z5, PW_F0382) /* z5 = (tmp10 - tmp12) * 0.382683433 */ \
+ \
+ DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+ z2 = _mm_add_pi16(z2, z5); /* z2 = tmp10 * 0.541196100 + z5 */ \
+ \
+ DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+ z4 = _mm_add_pi16(z4, z5); /* z4 = tmp12 * 1.306562965 + z5 */ \
+ \
+ DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) /* z3 = tmp11 * 0.707106781 */ \
+ \
+ z11 = _mm_add_pi16(tmp7, z3); \
+ z13 = _mm_sub_pi16(tmp7, z3); \
+ \
+ out5 = _mm_add_pi16(z13, z2); \
+ out3 = _mm_sub_pi16(z13, z2); \
+ out1 = _mm_add_pi16(z11, z4); \
+ out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ /* Row pass over the four rows that start at dataptr: load them, transpose so colN holds column N of those rows, transform, store back */ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ DO_FDCT_COMMON() \
+ /* Stored transposed: each outN word holds output N of the four rows, which is the layout DO_FDCT_PASS2 loads */ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ /* Column pass over the four-lane-wide strip at dataptr: load eight words (stride DCTSIZE), transpose into row0..row7, transform, store outN at dataptr[DCTSIZE * N] */ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ DO_FDCT_COMMON() \
+ /* One output word per row of the block; no further transpose is done here */ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+ DCTELEM *dataptr = data;
+ /* Fast integer forward DCT done in place on data; the locals above are the working state that the DO_FDCT_* macros read and write by name. */
+ /* Pass 1: process rows, four per macro expansion. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4; /* advance four rows */
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns, four per macro expansion. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4; /* advance four DCTELEMs to the second half of each row */
+ DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jfdctint-mmi.c b/media/libjpeg/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13 /* the FIX_* multipliers below carry 13 fractional bits (value * 8192, rounded) */
+#define PASS1_BITS 2 /* extra scaling left in the pass-1 output and removed in pass 2 */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) /* right shift used by DO_FDCT_COMMON(1) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS) /* right shift used by DO_FDCT_COMMON(2) */
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index { /* slot numbers of const_value[]; the PW_* and PD_* macros pass them to get_const_value() */
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = { /* one 64-bit word per enum const_index entry, in the same order */
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)), /* PW_F130_F054: weights giving out2 from (tmp13, tmp12) */
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541), /* PW_F054_MF130: weights giving out6 from (tmp13, tmp12) */
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)), /* PW_MF078_F117: weights giving z3 from (z3, z4) */
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175), /* PW_F117_F078: weights giving z4 from (z3, z4) */
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)), /* PW_MF060_MF089: tmp4 term of out7 from (tmp4, tmp7) */
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899), /* PW_MF089_F060: tmp7 term of out1 from (tmp4, tmp7) */
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)), /* PW_MF050_MF256: tmp5 term of out5 from (tmp5, tmp6) */
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562), /* PW_MF256_F050: tmp6 term of out3 from (tmp5, tmp6) */
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), /* PD_DESCALE_P1: rounding term added before >> DESCALE_P1 */
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), /* PD_DESCALE_P2: rounding term added before >> DESCALE_P2 */
+ _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+ (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1))) /* PW_DESCALE_P2X: rounding term added before the 16-bit >> PASS1_BITS */
+};
+/* Shorthand accessors; get_const_value() is not defined in this file (presumably it reads const_value[index]). */
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+ __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+ __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ /* Shared tail of both passes: builds out2/out6 from tmp12/tmp13 and out1/out3/out5/out7 from tmp4..tmp7; PASS (1 or 2) is token-pasted to pick PD_DESCALE_P<PASS> and DESCALE_P<PASS> */ \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * out2 = z1 + tmp13 * 0.765366865; \
+ * out6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); /* pair up (tmp13, tmp12) for the multiply-adds below */ \
+ tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+ \
+ out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+ out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+ out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+ out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); /* add the rounding term, then shift right */ \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); /* back to four 16-bit lanes */ \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = _mm_add_pi16(tmp4, tmp6); \
+ z4 = _mm_add_pi16(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); /* z3l/z3h and z4l/z4h stay 32-bit; they are added to the products below before descaling */ \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \
+ * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * out7 = tmp4 + z3; out5 = tmp5 + z4; \
+ * out3 = tmp6 + z3; out1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+ tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+ \
+ tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+ tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+ tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+ tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+ \
+ out7l = _mm_add_pi32(tmp4l, z3l); \
+ out7h = _mm_add_pi32(tmp4h, z3h); \
+ out1l = _mm_add_pi32(tmp7l, z4l); \
+ out1h = _mm_add_pi32(tmp7h, z4h); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ /* Same scheme for the (tmp5, tmp6) pair, which yields out5 and out3 */ \
+ tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+ tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+ \
+ tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+ tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+ tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+ tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+ \
+ out5l = _mm_add_pi32(tmp5l, z4l); \
+ out5h = _mm_add_pi32(tmp5h, z4h); \
+ out3l = _mm_add_pi32(tmp6l, z3l); \
+ out3h = _mm_add_pi32(tmp6h, z3h); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ __m64 tmp10, tmp11; \
+ /* Row pass over the four rows that start at dataptr: load them, transpose so colN holds column N of those rows, transform, store back */ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ out0 = _mm_slli_pi16(out0, PASS1_BITS); /* out0/out4 skip the multiply, so they are scaled up by PASS1_BITS explicitly */ \
+ out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(1) \
+ /* Stored transposed: each outN word holds output N of the four rows, which is the layout DO_FDCT_PASS2 loads */ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ __m64 tmp10, tmp11; \
+ /* Column pass over the four-lane-wide strip at dataptr: load eight words (stride DCTSIZE), transpose into row0..row7, transform, store outN at dataptr[DCTSIZE * N] */ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ /* Round, then shift out the PASS1_BITS of scaling that pass 1 left in the data */ \
+ out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+ out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(2) \
+ /* One output word per row of the block; no further transpose is done here */ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp12, tmp13;
+ DCTELEM *dataptr = data;
+ /* Accurate integer forward DCT done in place on data; the locals above are the working state that the DO_FDCT_* macros read and write by name. */
+ /* Pass 1: process rows, four per macro expansion. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4; /* advance four rows */
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns, four per macro expansion. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4; /* advance four DCTELEMs to the second half of each row */
+ DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jidctfst-mmi.c b/media/libjpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8 /* fractional bits of the FIX_* values, so FIX(1) == 256 */
+#define PASS1_BITS 2 /* removed in pass 2 by the shift of PASS1_BITS + 3 */
+
+#define FIX_1_082 ((short)277) /* FIX(1.082392200) */
+#define FIX_1_414 ((short)362) /* FIX(1.414213562) */
+#define FIX_1_847 ((short)473) /* FIX(1.847759065) */
+#define FIX_2_613 ((short)669) /* FIX(2.613125930) */
+#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - 3 * FIX(1) = 669 - 768 = -99 */
+
+#define PRE_MULTIPLY_SCALE_BITS 2 /* left shift applied to an operand before _mm_mulhi_pi16 */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) /* = 6; left shift applied to FIX_* in const_value[] */
+
+enum const_index {
+ index_PW_F1082,
+ index_PW_F1414,
+ index_PW_F1847,
+ index_PW_MF1613,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = { /* entries are listed in enum const_index order */
+ _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT), /* index_PW_F1082 */
+ _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT), /* index_PW_F1414 */
+ _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT), /* index_PW_F1847 */
+ _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT), /* index_PW_MF1613; FIX_1_613 is -99, so this entry is positive */
+ _uint64_set1_pi8(CENTERJSAMPLE) /* index_PB_CENTERJSAMP */
+};
+
+#define PW_F1414 get_const_value(index_PW_F1414)
+#define PW_F1847 get_const_value(index_PW_F1847)
+#define PW_MF1613 get_const_value(index_PW_MF1613)
+#define PW_F1082 get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) /* nonzero when the 32 bits of mm32 are all zero; mm32 must be an lvalue */
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) /* nonzero when the 64 bits of mm64 are all zero; mm64 must be an lvalue */
+
+
+#define DO_IDCT_COMMON() { /* reads tmp0-tmp3 and z10-z13; writes out0-out7 */ \
+ tmp7 = _mm_add_pi16(z11, z13); /* tmp7=z11+z13 */ \
+ \
+ tmp11 = _mm_sub_pi16(z11, z13); \
+ tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+ tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); /* tmp11=(z11-z13)*1.414213562 */ \
+ \
+ tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation; PW_MF1613 encodes 3 - 2.613125930) \
+ * tmp12 = (0.386874070 - 3) * z10 + z5; \
+ * = 0.386874070 * z10 - z10 - z10 - z10 + z5; \
+ */ \
+ \
+ z5 = _mm_add_pi16(tmp10, tmp12); \
+ z5 = _mm_mulhi_pi16(z5, PW_F1847); /* z5=(z10+z12)*1.847759065 */ \
+ \
+ tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+ tmp10 = _mm_sub_pi16(tmp10, z5); /* tmp10=z12*1.082392200-z5 */ \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); /* positive multiplier, see above */ \
+ tmp12 = _mm_sub_pi16(tmp12, z10); /* three subtractions give -3*z10 */ \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_add_pi16(tmp12, z5); \
+ \
+ /* Final output stage */ \
+ \
+ tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+ tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+ tmp4 = _mm_add_pi16(tmp10, tmp5); \
+ \
+ out0 = _mm_add_pi16(tmp0, tmp7); \
+ out7 = _mm_sub_pi16(tmp0, tmp7); \
+ out1 = _mm_add_pi16(tmp1, tmp6); \
+ out6 = _mm_sub_pi16(tmp1, tmp6); \
+ \
+ out2 = _mm_add_pi16(tmp2, tmp5); \
+ out5 = _mm_sub_pi16(tmp2, tmp5); \
+ out4 = _mm_add_pi16(tmp3, tmp4); \
+ out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+ tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+ tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+ tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp2); \
+ tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+ tmp13 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+ tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+ \
+ z13 = _mm_add_pi16(tmp6, tmp5); \
+ z10 = _mm_sub_pi16(tmp6, tmp5); \
+ z11 = _mm_add_pi16(tmp4, tmp7); \
+ z12 = _mm_sub_pi16(tmp4, tmp7); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h; \
+ __m64 col0, col1, col2, col3; \
+ __m64 row06, row17, row24, row35; \
+ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(row0l, row4l); \
+ tmp11 = _mm_sub_pi16(row0l, row4l); \
+ tmp13 = _mm_add_pi16(row2l, row6l); \
+ \
+ tmp12 = _mm_sub_pi16(row2l, row6l); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = _mm_add_pi16(row5l, row3l); \
+ z10 = _mm_sub_pi16(row5l, row3l); \
+ z11 = _mm_add_pi16(row1l, row7l); \
+ z12 = _mm_sub_pi16(row1l, row7l); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+ out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+ out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+ out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+ out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+ out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+ out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 tmp10, tmp11, tmp12, tmp13;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 z5, z10, z11, z12, z13;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr; /* NOTE(review): ISLOW type name in the ifast path; confirm it matches the table's element type */
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns; results are stored transposed in workspace. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+
+ DO_IDCT_PASS1(1) /* jumps to nextcolumn1 when it takes the all-AC-zero shortcut */
+nextcolumn1:
+ inptr += 4; /* advance four elements in the coefficient block */
+ quantptr += 4; /* matching step in the dequantization table */
+ wsptr += DCTSIZE * 4; /* each pass-1 expansion fills four workspace rows */
+ DO_IDCT_PASS1(2) /* jumps to nextcolumn2 when it takes the all-AC-zero shortcut */
+nextcolumn2:
+
+ /* Pass 2: process rows from workspace and store samples to output_buf. */
+
+ wsptr = workspace;
+
+ DO_IDCT_PASS2(0) /* stores to output_buf[0..3] at output_col */
+ wsptr += 4; /* second group of four elements in each workspace row */
+ DO_IDCT_PASS2(4) /* stores to output_buf[4..7] at output_col */
+}
diff --git a/media/libjpeg/simd/mips64/jidctint-mmi.c b/media/libjpeg/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE 128
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = { /* entries are listed in enum const_index order */
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)), /* index_PW_F130_F054 */
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541), /* index_PW_F054_MF130 */
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)), /* index_PW_MF078_F117 */
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175), /* index_PW_F117_F078 */
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)), /* index_PW_MF060_MF089 */
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899), /* index_PW_MF089_F060 */
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)), /* index_PW_MF050_MF256 */
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562), /* index_PW_MF256_F050 */
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), /* index_PD_DESCALE_P1: rounding term added before the pass-1 shift */
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), /* index_PD_DESCALE_P2: rounding term added before the pass-2 shift */
+ _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+ CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) /* index_PB_CENTERJSAMP */
+};
+
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) /* nonzero when the 32 bits of mm32 are all zero; mm32 must be an lvalue */
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) /* nonzero when the 64 bits of mm64 are all zero; mm64 must be an lvalue */
+
+
+#define DO_IDCT_COMMON(PASS) { \
+ __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+ \
+ z3 = _mm_add_pi16(tmp0, tmp2); \
+ z4 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ \
+ tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+ tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+ \
+ tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+ tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+ tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+ tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+ \
+ tmp0l = _mm_add_pi32(tmp0l, z3l); \
+ tmp0h = _mm_add_pi32(tmp0h, z3h); \
+ tmp3l = _mm_add_pi32(tmp3l, z4l); \
+ tmp3h = _mm_add_pi32(tmp3h, z4h); \
+ \
+ tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+ tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+ \
+ tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+ tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+ tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+ tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+ \
+ tmp1l = _mm_add_pi32(tmp1l, z4l); \
+ tmp1h = _mm_add_pi32(tmp1h, z4h); \
+ tmp2l = _mm_add_pi32(tmp2l, z3l); \
+ tmp2h = _mm_add_pi32(tmp2h, z3h); \
+ \
+ /* Final output stage */ \
+ \
+ out0l = _mm_add_pi32(tmp10l, tmp3l); \
+ out0h = _mm_add_pi32(tmp10h, tmp3h); \
+ out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+ out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+ \
+ out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+ out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+ out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+ out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out0 = _mm_packs_pi32(out0l, out0h); \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ \
+ out1l = _mm_add_pi32(tmp11l, tmp2l); \
+ out1h = _mm_add_pi32(tmp11h, tmp2h); \
+ out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+ out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ out2l = _mm_add_pi32(tmp12l, tmp1l); \
+ out2h = _mm_add_pi32(tmp12h, tmp1h); \
+ out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+ out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ \
+ out3l = _mm_add_pi32(tmp13l, tmp0l); \
+ out3h = _mm_add_pi32(tmp13h, tmp0h); \
+ \
+ out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+ out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+ out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+ out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+ out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+ \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+ out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 z23, z2, z3, z23l, z23h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); \
+ dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ z2 = _mm_mullo_pi16(col2l, quant2l); \
+ z3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ z23l = _mm_unpacklo_pi16(z2, z3); \
+ z23h = _mm_unpackhi_pi16(z2, z3); \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ \
+ z2 = _mm_mullo_pi16(col0l, quant0l); \
+ z3 = _mm_mullo_pi16(col4l, quant4l); \
+ \
+ z23 = _mm_add_pi16(z2, z3); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ \
+ z23 = _mm_sub_pi16(z2, z3); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+ tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+ \
+ DO_IDCT_COMMON(1) \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { /* pass 2: four workspace columns -> output rows ctr..ctr+3 */ \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 z23, z23l, z23h; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+ __m64 col0, col1, col2, col3; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ /* Load the first four values (relative to wsptr) of all eight rows. */ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ /* Interleave rows 2 and 6 (z2, z3) so each madd sees one (z2, z3) pair. */ \
+ z23l = _mm_unpacklo_pi16(row2l, row6l); \
+ z23h = _mm_unpackhi_pi16(row2l, row6l); \
+ \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ /* tmp0 = row0 + row4 in 32-bit lanes (assumes _mm_load{lo,hi}_pi16_f put each value in the upper half of a lane -- TODO confirm) */ \
+ z23 = _mm_add_pi16(row0l, row4l); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ /* tmp1 = row0 - row4, widened and shifted the same way as tmp0 */ \
+ z23 = _mm_sub_pi16(row0l, row4l); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ /* tmp0..tmp3 and out0..out7 are not declared here; the expanding function provides them. */ \
+ tmp0 = row7l; \
+ tmp1 = row5l; \
+ tmp2 = row3l; \
+ tmp3 = row1l; \
+ \
+ DO_IDCT_COMMON(2) \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ /* Narrow each pair of 16-bit rows into eight 8-bit values. */ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ /* Add PB_CENTERJSAMP to every byte (presumably re-centres to the unsigned sample range). */ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ /* Store eight samples into each of output rows ctr+0 .. ctr+3 at output_col. */ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{ /* Two-pass inverse DCT of coef_block; samples go to output_buf[0..7] + output_col. */
+ __m64 tmp0, tmp1, tmp2, tmp3; /* assigned inside the DO_IDCT_* macros */
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7; /* likewise read by the macros */
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+ /* First half: input columns 0-3, written to workspace rows 0-3. */
+ DO_IDCT_PASS1(1)
+nextcolumn1: /* paired with DO_IDCT_PASS1(1); presumably its jump target (macro head not visible here) */
+ inptr += 4;
+ quantptr += 4;
+ wsptr += DCTSIZE * 4;
+ DO_IDCT_PASS1(2)
+nextcolumn2: /* paired with DO_IDCT_PASS1(2) in the same way */
+
+ /* Pass 2: process rows. */
+
+ wsptr = workspace;
+ /* Each expansion consumes four workspace columns and emits four output rows. */
+ DO_IDCT_PASS2(0)
+ wsptr += 4;
+ DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jquanti-mmi.c b/media/libjpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { /* quantize one row of eight values; advances workspace, divisors, output_ptr */ \
+ __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+ __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+ /* Each row is handled as two 4 x 16-bit vectors: elements 0-3 (l) and 4-7 (h). */ \
+ rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+ rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+ \
+ /* Branch-less absolute value */ \
+ rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \
+ /* 0 otherwise */ \
+ rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* (x ^ s) - s negates x when s == -1 */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ /* The divisors table holds three DCTSIZE2-sized planes: reciprocal, correction, scale. */ \
+ corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
+ corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+ \
+ rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \
+ rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+ \
+ recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
+ reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, recipl); \
+ rowh = _mm_mulhi_pi16(rowh, reciph); \
+ \
+ /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+ /* initial value (input value is never negative as we inverted it at the */ \
+ /* start of this routine) */ \
+ rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
+ scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, scalel); \
+ rowh = _mm_mulhi_pi16(rowh, scaleh); \
+ \
+ /* determine if scale is negative */ \
+ scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+ scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+ \
+ /* and add input if it is */ \
+ scalel = _mm_and_si64(scalel, rowlsave); \
+ scaleh = _mm_and_si64(scaleh, rowhsave); \
+ rowl = _mm_add_pi16(rowl, scalel); \
+ rowh = _mm_add_pi16(rowh, scaleh); \
+ \
+ /* then check if negative input */ \
+ rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+ rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+ \
+ /* and add scale if it is */ \
+ rowlsave = _mm_and_si64(rowlsave, scalel); \
+ rowhsave = _mm_and_si64(rowhsave, scaleh); \
+ rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* re-apply the sign saved in rowls/rowhs */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ \
+ _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+ _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+ /* Step all three pointers to the next row of DCTSIZE elements. */ \
+ workspace += DCTSIZE; \
+ divisors += DCTSIZE; \
+ output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{ /* Quantize the values in workspace into coef_block using the divisors table. */
+ JCOEFPTR output_ptr = coef_block;
+ /* Eight expansions; each handles one row and advances the three pointers. */
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+}
diff --git a/media/libjpeg/simd/mips64/jsimd.c b/media/libjpeg/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..917440b43b
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd.c
@@ -0,0 +1,866 @@
+/*
+ * jsimd_mips64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  /* Return 1 if 'buffer' is an "ASEs implemented" line of /proc/cpuinfo that
+   * lists 'feature' as a whitespace-delimited word; return 0 otherwise. */
+  char *p, *start;
+  size_t len = strlen(feature);
+
+  if (len == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  /* <ctype.h> functions need an unsigned char value (or EOF). */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  start = buffer;  /* a match starting here counts as a word start */
+  /* Accept only a whole-word match.  The left boundary is tested against
+   * 'start', not the moving search cursor, so that a suffix of a longer
+   * word (e.g. "xfoo" when looking for "foo") is never accepted. */
+  while ((p = strstr(buffer, feature))) {
+    if ((p == start || isspace((unsigned char)p[-1])) &&
+        (p[len] == 0 || isspace((unsigned char)p[len])))
+      return 1;
+    buffer = p + 1;  /* resume the search just past this candidate */
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{ /* Scan /proc/cpuinfo line by line and set JSIMD_MMI in simd_support if "loongson-mmi" is listed. */
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+ /* Start from "nothing detected" on every attempt. */
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+ /* If the file cannot be opened, simd_support stays 0 and 1 is still returned. */
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0; /* init_simd() doubles bufsize and calls again */
+ }
+ if (check_feature(buffer, "loongson-mmi"))
+ simd_support |= JSIMD_MMI;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{ /* Fill in simd_support once; later calls return immediately. */
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if defined(__linux__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+ /* ~0 is the initial "not probed yet" value of simd_support. */
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__linux__)
+ while (!parse_proc_cpuinfo(bufsize)) { /* 0 = failed; retry with a larger buffer */
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__mips_loongson_vector_rev)
+ /* Only enable MMI by default on non-Linux platforms when the compiler flags
+ * support it. */
+ simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEMMI");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_MMI;
+ env = getenv("JSIMD_FORCENONE"); /* checked last, so it overrides JSIMD_FORCEMMI */
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0; /* unconditionally 0; jsimd_ycc_rgb565_convert() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ return 0; /* unconditionally 0; jsimd_c_null_convert() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* Dispatch to the *_ycc_convert_mmi routine matching cinfo->in_color_space. */
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ /* X and A variants of a pixel layout share one routine. */
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_ycc_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_ycc_convert_mmi;
+ break;
+ default: /* every other colour space uses the generic routine */
+ mmifct = jsimd_rgb_ycc_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* Dispatch to the *_gray_convert_mmi routine matching cinfo->in_color_space. */
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ /* X and A variants of a pixel layout share one routine. */
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_gray_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_gray_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_gray_convert_mmi;
+ break;
+ default: /* every other colour space uses the generic routine */
+ mmifct = jsimd_rgb_gray_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* Dispatch to the jsimd_ycc_*_convert_mmi routine matching cinfo->out_color_space. */
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ /* X and A variants of a pixel layout share one routine. */
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_ycc_extrgb_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_ycc_extrgbx_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_ycc_extbgr_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_ycc_extbgrx_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_ycc_extxbgr_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_ycc_extxrgb_convert_mmi;
+ break;
+ default: /* every other colour space uses the generic routine */
+ mmifct = jsimd_ycc_rgb_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* Empty stub: does nothing; jsimd_can_ycc_rgb565() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{ /* Empty stub: does nothing; jsimd_c_can_null_convert() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ return 0; /* unconditionally 0; jsimd_h2v2_smooth_downsample() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ return 0; /* unconditionally 0; jsimd_h2v1_downsample() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* Thin wrapper: unpacks the needed cinfo/compptr fields for the MMI routine. */
+ jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* Empty stub: does nothing; jsimd_can_h2v2_smooth_downsample() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* Empty stub: does nothing; jsimd_can_h2v1_downsample() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ return 0; /* unconditionally 0; jsimd_h2v2_upsample() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ return 0; /* unconditionally 0; jsimd_h2v1_upsample() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ return 0; /* unconditionally 0; jsimd_int_upsample() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* Empty stub: does nothing; jsimd_can_h2v2_upsample() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* Empty stub: does nothing; jsimd_can_h2v1_upsample() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* Empty stub: does nothing; jsimd_can_int_upsample() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* Thin wrapper: unpacks the needed cinfo/compptr fields for the MMI routine. */
+ jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{ /* Thin wrapper: unpacks the needed cinfo/compptr fields for the MMI routine. */
+ jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{ /* Dispatch to the jsimd_h2v2_*_merged_upsample_mmi routine matching cinfo->out_color_space. */
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ /* X and A variants of a pixel layout share one routine. */
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+ break;
+ default: /* every other colour space uses the generic routine */
+ mmifct = jsimd_h2v2_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{ /* Dispatch to the jsimd_h2v1_*_merged_upsample_mmi routine matching cinfo->out_color_space. */
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ /* X and A variants of a pixel layout share one routine. */
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+ break;
+ default: /* every other colour space uses the generic routine */
+ mmifct = jsimd_h2v1_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ return 0; /* unconditionally 0; jsimd_convsamp() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0; /* unconditionally 0; jsimd_convsamp_float() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{ /* Empty stub: does nothing; jsimd_can_convsamp() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{ /* Empty stub: does nothing; jsimd_can_convsamp_float() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0; /* unconditionally 0; jsimd_fdct_float() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_mmi(data); /* forwards 'data' unchanged to the MMI routine */
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_mmi(data); /* forwards 'data' unchanged to the MMI routine */
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{ /* Empty stub: does nothing; jsimd_can_fdct_float() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0; /* unconditionally 0; jsimd_quantize_float() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_mmi(coef_block, divisors, workspace); /* arguments forwarded unchanged */
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{ /* Empty stub: does nothing; jsimd_can_quantize_float() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0; /* unconditionally 0; jsimd_idct_2x2() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0; /* unconditionally 0; jsimd_idct_4x4() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ return 0; /* unconditionally 0; jsimd_idct_6x6() in this file is an empty stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ return 0; /* unconditionally 0; jsimd_idct_12x12() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Empty stub: does nothing; jsimd_can_idct_2x2() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Empty stub: does nothing; jsimd_can_idct_4x4() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Empty stub: does nothing; jsimd_can_idct_6x6() in this file returns 0. */
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Empty stub: does nothing; jsimd_can_idct_12x12() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{ /* Returns 1 only if the checks below pass and simd_support has JSIMD_MMI set; else 0. */
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0; /* unconditionally 0; jsimd_idct_float() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Thin wrapper: passes compptr->dct_table plus the block/output arguments; cinfo is unused. */
+ jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Thin wrapper: passes compptr->dct_table plus the block/output arguments; cinfo is unused. */
+ jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* Empty stub: does nothing; jsimd_can_idct_float() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0; /* unconditionally 0; jsimd_huff_encode_one_block() in this file only returns NULL */
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL; /* stub: ignores all arguments; jsimd_can_huff_encode_one_block() returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0; /* unconditionally 0; jsimd_encode_mcu_AC_first_prepare() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{ /* Empty stub: does nothing; jsimd_can_encode_mcu_AC_first_prepare() in this file returns 0. */
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0; /* unconditionally 0; jsimd_encode_mcu_AC_refine_prepare() in this file only returns 0 */
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return 0; /* stub: ignores all arguments; jsimd_can_encode_mcu_AC_refine_prepare() returns 0 */
+}
diff --git a/media/libjpeg/simd/mips64/jsimd_mmi.h b/media/libjpeg/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * QingfaLiu <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code: pointer-sized add/shift mnemonics, selected by ABI (presumably spliced into inline-assembly strings) */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU "daddu " /* 64-bit ABI: doubleword forms */
+# define PTR_SLL "dsll "
+#else
+# define PTR_ADDU "addu " /* any other ABI: word forms */
+# define PTR_SLL "sll "
+#endif
+
+#define SIZEOF_MMWORD 8 /* presumably the size in bytes of one __m64 vector */
+#define BYTE_BIT 8 /* bits per 8-bit element */
+#define WORD_BIT 16 /* bits per 16-bit element; DO_QUANT shifts by WORD_BIT - 1 */
+#define SCALEBITS 16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) /* 8 x 8-bit -> uint64_t, a = top byte */ \
+  (((uint64_t)(uint8_t)(a) << 56) | /* args are parenthesized so that the */ \
+   ((uint64_t)(uint8_t)(b) << 48) | /* narrowing cast applies to a whole  */ \
+   ((uint64_t)(uint8_t)(c) << 40) | /* expression argument, not only to   */ \
+   ((uint64_t)(uint8_t)(d) << 32) | /* its first operand                  */ \
+   ((uint64_t)(uint8_t)(e) << 24) | \
+   ((uint64_t)(uint8_t)(f) << 16) | \
+   ((uint64_t)(uint8_t)(g) << 8) | \
+   ((uint64_t)(uint8_t)(h)))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a) /* same byte in all 8 lanes */
+#define _uint64_set_pi16(a, b, c, d) /* 4 x 16-bit -> uint64_t, a = top word */ \
+  (((uint64_t)(uint16_t)(a) << 48) | \
+   ((uint64_t)(uint16_t)(b) << 32) | \
+   ((uint64_t)(uint16_t)(c) << 16) | \
+   ((uint64_t)(uint16_t)(d)))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a) /* same word in all 4 lanes */
+#define _uint64_set_pi32(a, b) /* 2 x 32-bit -> uint64_t, a = upper half */ \
+  (((uint64_t)(uint32_t)(a) << 32) | \
+   ((uint64_t)(uint32_t)(b)))
+
+#define get_const_value(index) (*(__m64 *)&const_value[index]) /* reads const_value[index] (array supplied by the including file) as an __m64 */
diff --git a/media/libjpeg/simd/mips64/loongson-mmintrin.h b/media/libjpeg/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/media/libjpeg/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+/* Attributes applied to every intrinsic in this header: GNU inline
+   semantics, forced inlining, and the "artificial" marker.  Together with
+   the `extern __inline` declarations this means the definitions are used
+   for inlining only. */
+#define FUNCTION_ATTRIBS \
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+/* (The asm statements below bind __m64 values with the "f" constraint.) */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+ load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+/* Returns the __m64 value 0.0 (an all-zero bit pattern). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+  const __m64 zero = 0.0;
+
+  return zero;
+}
+
+/* Builds a vector from eight bytes.  The even-numbered bytes (__b0, __b2,
+   __b4, __b6) and the odd-numbered bytes are first packed into two 32-bit
+   words, which punpcklbh then interleaves; presumably this leaves __b0 in
+   the least significant byte and __b7 in the most significant one.
+   $f0 is used as a scratch register and is declared as clobbered. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+  __m64 packed;
+  uint32_t even_bytes, odd_bytes;
+
+  even_bytes = (uint32_t)__b0;
+  even_bytes |= (uint32_t)__b2 << 8;
+  even_bytes |= (uint32_t)__b4 << 16;
+  even_bytes |= (uint32_t)__b6 << 24;
+
+  odd_bytes = (uint32_t)__b1;
+  odd_bytes |= (uint32_t)__b3 << 8;
+  odd_bytes |= (uint32_t)__b5 << 16;
+  odd_bytes |= (uint32_t)__b7 << 24;
+
+  asm("mtc1 %1, %0\n\t"
+      "mtc1 %2, $f0\n\t"
+      "punpcklbh %0, %0, $f0\n\t"
+      : "=f" (packed)
+      : "r" (even_bytes), "r" (odd_bytes)
+      : "$f0"
+     );
+
+  return packed;
+}
+
+/* Builds a vector from four 16-bit values.  __h0/__h2 and __h1/__h3 are
+   packed into two 32-bit words that punpcklhw then interleaves; presumably
+   __h0 ends up in the least significant lane.  $f0 is scratch. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+  __m64 packed;
+  uint32_t even_halves = (uint32_t)__h0 | ((uint32_t)__h2 << 16);
+  uint32_t odd_halves = (uint32_t)__h1 | ((uint32_t)__h3 << 16);
+
+  asm("mtc1 %1, %0\n\t"
+      "mtc1 %2, $f0\n\t"
+      "punpcklhw %0, %0, $f0\n\t"
+      : "=f" (packed)
+      : "r" (even_halves), "r" (odd_halves)
+      : "$f0"
+     );
+
+  return packed;
+}
+
+/* Packs four 2-bit selectors into one 8-bit value: fp3 occupies bits 7..6,
+   fp2 bits 5..4, fp1 bits 3..2 and fp0 bits 1..0.  Used in this header as
+   the control operand of pshufh. */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+/* Builds a vector with __i1 in the upper 32 bits and __i0 in the lower 32.
+   When the two values are not both compile-time constants and happen to be
+   equal, the result is produced with a pshufh of __i1 instead of a 64-bit
+   integer build; every other case assembles the 64-bit pattern in C and
+   reinterprets it as an __m64. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+  uint64_t bits;
+
+  if (!(__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) &&
+      __i1 == __i0) {
+    uint64_t selector = _MM_SHUFFLE(1, 0, 1, 0);
+    __m64 splat;
+
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (splat)
+        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&selector)
+       );
+
+    return splat;
+  }
+
+  bits = ((uint64_t)__i1 << 32) | ((uint64_t)__i0 << 0);
+
+  return *(__m64 *)&bits;
+}
+
+/* Replicates one byte into a vector.  The byte is doubled into a 16-bit
+   value ((b << 8) | b) in the scratch register $8, moved to the FP
+   register, and then spread with pshufh using an all-zero selector held in
+   $f0.
+
+   The doubled value is built entirely in $8: %1 is an input-only operand
+   and must not be written by the asm, because the compiler may keep using
+   that register for __b0 afterwards.  $8 and $f0 are declared clobbered. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+  __m64 ret;
+
+  asm("sll $8, %1, 8\n\t"
+      "or $8, $8, %1\n\t"
+      "mtc1 $8, %0\n\t"
+      "mtc1 $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__b0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+/* Replicates one 16-bit value into a vector: the value is moved to the FP
+   register and spread with pshufh using an all-zero selector held in $f0.
+   Only $f0 is used as scratch, so it is the only register listed as
+   clobbered ($8 is not touched by this sequence). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+  __m64 ret;
+
+  asm("mtc1 %1, %0\n\t"
+      "mtc1 $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__h0)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+/* Passes the same 32-bit value as both halves to _mm_set_pi32. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+  __m64 both_halves = _mm_set_pi32(__i0, __i0);
+
+  return both_halves;
+}
+
+/* Reversed-argument form of _mm_set_pi8: __h0 is forwarded as its last
+   argument and __h7 as its first. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+  __m64 reversed;
+
+  reversed = _mm_set_pi8(__h7, __h6, __h5, __h4, __h3, __h2, __h1, __h0);
+  return reversed;
+}
+
+/* Reversed-argument form of _mm_set_pi16: __w0 is forwarded as its last
+   argument and __w3 as its first. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+  __m64 reversed = _mm_set_pi16(__w3, __w2, __w1, __w0);
+
+  return reversed;
+}
+
+/* Reversed-argument form of _mm_set_pi32: __i0 is forwarded as its second
+   argument and __i1 as its first. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+  __m64 reversed = _mm_set_pi32(__i1, __i0);
+
+  return reversed;
+}
+
+
+/********** Arithmetic Operations **********/
+
+/* Emits paddb on two vectors held in FP registers (packed 8-bit add). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddb %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddh on two vectors held in FP registers (packed 16-bit add). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddh %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddw on two vectors held in FP registers (packed 32-bit add). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddw %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddd on two vectors held in FP registers (64-bit add). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddd %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddsb; named after the MMX signed-saturating 8-bit add. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddsb %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddsh; named after the MMX signed-saturating 16-bit add. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddsh %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+
+/* Emits paddusb; named after the MMX unsigned-saturating 8-bit add. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddusb %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits paddush; named after the MMX unsigned-saturating 16-bit add. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 sum;
+
+  asm("paddush %0, %1, %2\n\t" : "=f" (sum) : "f" (__m1), "f" (__m2));
+  return sum;
+}
+
+/* Emits pavgb on two vectors (unsigned 8-bit average, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 average;
+
+  asm("pavgb %0, %1, %2\n\t" : "=f" (average) : "f" (__m1), "f" (__m2));
+  return average;
+}
+
+/* Emits pavgh on two vectors (unsigned 16-bit average, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 average;
+
+  asm("pavgh %0, %1, %2\n\t" : "=f" (average) : "f" (__m1), "f" (__m2));
+  return average;
+}
+
+/* Emits pmaddhw; named after the MMX 16-bit multiply-and-add. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 dot;
+
+  asm("pmaddhw %0, %1, %2\n\t" : "=f" (dot) : "f" (__m1), "f" (__m2));
+  return dot;
+}
+
+/* Emits pmaxsh on two vectors (signed 16-bit maximum, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 larger;
+
+  asm("pmaxsh %0, %1, %2\n\t" : "=f" (larger) : "f" (__m1), "f" (__m2));
+  return larger;
+}
+
+/* Emits pmaxub on two vectors (unsigned 8-bit maximum, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 larger;
+
+  asm("pmaxub %0, %1, %2\n\t" : "=f" (larger) : "f" (__m1), "f" (__m2));
+  return larger;
+}
+
+/* Emits pminsh on two vectors (signed 16-bit minimum, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 smaller;
+
+  asm("pminsh %0, %1, %2\n\t" : "=f" (smaller) : "f" (__m1), "f" (__m2));
+  return smaller;
+}
+
+/* Emits pminub on two vectors (unsigned 8-bit minimum, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 smaller;
+
+  asm("pminub %0, %1, %2\n\t" : "=f" (smaller) : "f" (__m1), "f" (__m2));
+  return smaller;
+}
+
+/* Returns the byte mask produced by pmovmskb for __m1 as an int.
+
+   pmovmskb operates on FP registers, so the source is bound with the "f"
+   constraint (on MIPS the "y" constraint is only a legacy alias of "r" and
+   would hand the instruction a general-purpose register).  The mask is
+   produced in the scratch register $f0 and then copied to the integer
+   result with mfc1; $f0 is declared clobbered. */
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+  int ret;
+
+  asm("pmovmskb $f0, %1\n\t"
+      "mfc1 %0, $f0\n\t"
+      : "=r" (ret)
+      : "f" (__m1)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+/* Emits pmulhh; named after the MMX signed 16-bit high-half multiply. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 product;
+
+  asm("pmulhh %0, %1, %2\n\t" : "=f" (product) : "f" (__m1), "f" (__m2));
+  return product;
+}
+
+/* Emits pmulhuh; named after the unsigned 16-bit high-half multiply. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 product;
+
+  asm("pmulhuh %0, %1, %2\n\t" : "=f" (product) : "f" (__m1), "f" (__m2));
+  return product;
+}
+
+/* Emits pmullh; named after the MMX 16-bit low-half multiply. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 product;
+
+  asm("pmullh %0, %1, %2\n\t" : "=f" (product) : "f" (__m1), "f" (__m2));
+  return product;
+}
+
+/* Emits pmuluw on two vectors (unsigned 32-bit multiply, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+  __m64 product;
+
+  asm("pmuluw %0, %1, %2\n\t" : "=f" (product) : "f" (__m1), "f" (__m2));
+  return product;
+}
+
+/* Emits psadbh; named after the sum-of-absolute-differences intrinsic. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 sad;
+
+  asm("psadbh %0, %1, %2\n\t" : "=f" (sad) : "f" (__m1), "f" (__m2));
+  return sad;
+}
+
+
+/* Emits pasubub on two vectors (presumably a per-byte unsigned absolute
+   difference -- confirm against the Loongson MMI reference). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 abs_diff;
+
+  asm("pasubub %0, %1, %2\n\t" : "=f" (abs_diff) : "f" (__m1), "f" (__m2));
+  return abs_diff;
+}
+
+/* Emits biadd with both arguments as source operands.
+   NOTE(review): biadd may be a two-operand (destination, source)
+   instruction; if so, the three-operand form used here would be rejected
+   by the assembler once this intrinsic is instantiated -- confirm against
+   the Loongson MMI reference. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 total;
+
+  asm("biadd %0, %1, %2\n\t" : "=f" (total) : "f" (__m1), "f" (__m2));
+  return total;
+}
+
+/* Emits psubb: packed 8-bit subtract with __m1 as first source operand. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubb %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubh: packed 16-bit subtract with __m1 as first source operand. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubh %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubw: packed 32-bit subtract with __m1 as first source operand. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubw %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubd: 64-bit subtract with __m1 as first source operand. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubd %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubsb; named after the MMX signed-saturating 8-bit subtract. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubsb %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubsh; named after the MMX signed-saturating 16-bit subtract. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubsh %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+
+/* Emits psubusb; named after the unsigned-saturating 8-bit subtract. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubusb %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+/* Emits psubush; named after the unsigned-saturating 16-bit subtract. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 diff;
+
+  asm("psubush %0, %1, %2\n\t" : "=f" (diff) : "f" (__m1), "f" (__m2));
+  return diff;
+}
+
+
+/********** Logical Operations **********/
+
+/* Emits `and` on two vectors held in FP registers. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 bits;
+
+  asm("and %0, %1, %2\n\t" : "=f" (bits) : "f" (__m1), "f" (__m2));
+  return bits;
+}
+
+/* Emits `andn` with __m1 and __m2 as source operands.  Which operand is
+   complemented is defined by the instruction, not by this wrapper. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 bits;
+
+  asm("andn %0, %1, %2\n\t" : "=f" (bits) : "f" (__m1), "f" (__m2));
+  return bits;
+}
+
+
+/* Emits `or` on two __m32 values held in FP registers.
+   NOTE(review): the result is held in an __m32 (float) local while the
+   function is declared to return __m64 (double), so the return statement
+   performs a float-to-double value conversion rather than a bit-for-bit
+   copy.  Confirm what callers expect before changing either type. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+  __m32 bits;
+
+  asm("or %0, %1, %2\n\t" : "=f" (bits) : "f" (__m1), "f" (__m2));
+  return bits;
+}
+
+/* Emits `or` on two vectors held in FP registers. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 bits;
+
+  asm("or %0, %1, %2\n\t" : "=f" (bits) : "f" (__m1), "f" (__m2));
+  return bits;
+}
+
+/* Emits `xor` on two vectors held in FP registers. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 bits;
+
+  asm("xor %0, %1, %2\n\t" : "=f" (bits) : "f" (__m1), "f" (__m2));
+  return bits;
+}
+
+
+/********** Shift Operations **********/
+
+/* Emits psllh (16-bit lanes, shift left).  The count is reinterpreted
+   bit-for-bit as an __m64 so that it can be passed in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psllh %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits psllw (32-bit lanes, shift left).  The count is reinterpreted
+   bit-for-bit as an __m64 so that it can be passed in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psllw %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits dsll with FP-register operands (64-bit shift left).  The count is
+   reinterpreted bit-for-bit as an __m64 to place it in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("dsll %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits psrlh (16-bit lanes, logical shift right).  The count is
+   reinterpreted bit-for-bit as an __m64 to place it in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psrlh %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits psrlw (32-bit lanes, logical shift right).  The count is
+   reinterpreted bit-for-bit as an __m64 to place it in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psrlw %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits dsrl with FP-register operands (64-bit logical shift right).  The
+   count is reinterpreted bit-for-bit as an __m64 for the FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("dsrl %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits psrah (16-bit lanes, arithmetic shift right).  The count is
+   reinterpreted bit-for-bit as an __m64 to place it in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psrah %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits psraw (32-bit lanes, arithmetic shift right).  The count is
+   reinterpreted bit-for-bit as an __m64 to place it in an FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("psraw %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+/* Emits dsra with FP-register operands (64-bit arithmetic shift right).
+   The count is reinterpreted bit-for-bit as an __m64 for the FP register. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+  __m64 shifted;
+
+  asm("dsra %0, %1, %2\n\t"
+      : "=f" (shifted) : "f" (__m), "f" (*(__m64 *)&__count));
+  return shifted;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+/* Reinterprets the 64 bits of x as an __m64 without any value conversion.
+   The copy goes through a union rather than a pointer cast, so the
+   uint64_t object is never accessed through a double lvalue (which the
+   strict-aliasing rules do not permit). */
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+  union {
+    uint64_t bits;
+    __m64 vec;
+  } pun;
+
+  pun.bits = x;
+  return pun.vec;
+}
+
+/* Reinterprets the 64 bits of an __m64 as a uint64_t without any value
+   conversion.  The copy goes through a union rather than a pointer cast,
+   so the double object is never accessed through an integer lvalue (which
+   the strict-aliasing rules do not permit). */
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+  union {
+    __m64 vec;
+    uint64_t bits;
+  } pun;
+
+  pun.vec = x;
+  return pun.bits;
+}
+
+
+/********** Comparison Intrinsics **********/
+
+/* Emits pcmpeqb on two vectors (8-bit equality compare, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpeqb %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpeqh on two vectors (16-bit equality compare, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpeqh %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpeqw on two vectors (32-bit equality compare, per the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpeqw %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpgtb with __m1 as first source operand (8-bit greater-than). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpgtb %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpgth with __m1 as first source operand (16-bit greater-than). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpgth %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpgtw with __m1 as first source operand (32-bit greater-than). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpgtw %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpltb with __m1 as first source operand (8-bit less-than, per
+   the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpltb %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmplth with __m1 as first source operand (16-bit less-than, per
+   the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmplth %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+/* Emits pcmpltw with __m1 as first source operand (32-bit less-than, per
+   the name). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 mask;
+
+  asm("pcmpltw %0, %1, %2\n\t" : "=f" (mask) : "f" (__m1), "f" (__m2));
+  return mask;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+/* Emits packsshb; named after the MMX signed-saturating 16-to-8 pack. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 packed;
+
+  asm("packsshb %0, %1, %2\n\t" : "=f" (packed) : "f" (__m1), "f" (__m2));
+  return packed;
+}
+
+/* Emits packsswh; named after the MMX signed-saturating 32-to-16 pack. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 packed;
+
+  asm("packsswh %0, %1, %2\n\t" : "=f" (packed) : "f" (__m1), "f" (__m2));
+  return packed;
+}
+
+/* Emits packsswh; the body is the same as that of _mm_packs_pi32. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 packed;
+
+  asm("packsswh %0, %1, %2\n\t" : "=f" (packed) : "f" (__m1), "f" (__m2));
+  return packed;
+}
+
+/* Emits packushb; named after the unsigned-saturating 16-to-8 pack. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 packed;
+
+  asm("packushb %0, %1, %2\n\t" : "=f" (packed) : "f" (__m1), "f" (__m2));
+  return packed;
+}
+
+/* Emits pextrh with the lane position passed in an FP register (the
+   integer __pos is reinterpreted bit-for-bit as an __m64).  The extracted
+   value is returned as an __m64. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+  __m64 lane;
+
+  asm("pextrh %0, %1, %2\n\t"
+      : "=f" (lane) : "f" (__m), "f" (*(__m64 *)&__pos));
+  return lane;
+}
+
+/* Inserts a 16-bit value taken from __m2 into lane __pos of __m1, using
+   the instruction pinsrh_0 .. pinsrh_3 that matches the lane number.
+
+   __pos is also bound with an "i" constraint, so it must be a compile-time
+   constant once the function has been inlined.  Valid positions are 0..3.
+   Any other position selects no instruction; in that case __m1 is returned
+   unchanged, so the result is never an uninitialized value. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+  __m64 ret;
+
+  switch (__pos) {
+  case 0:
+    asm("pinsrh_0 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+    break;
+
+  case 1:
+    asm("pinsrh_1 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+    break;
+
+  case 2:
+    asm("pinsrh_2 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+    break;
+
+  case 3:
+    asm("pinsrh_3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+    break;
+
+  default:
+    /* Out-of-range lane: leave the vector untouched. */
+    ret = __m1;
+    break;
+  }
+
+  return ret;
+}
+
+/* Emits pshufh with the selector __n passed in an FP register (the integer
+   is reinterpreted bit-for-bit as an __m64).  Selectors are typically
+   built with _MM_SHUFFLE. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+  __m64 shuffled;
+
+  asm("pshufh %0, %1, %2\n\t"
+      : "=f" (shuffled) : "f" (__m), "f" (*(__m64 *)&__n));
+  return shuffled;
+}
+
+/* Emits punpckhbh; named after the MMX high-half byte interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpckhbh %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpckhbh; the body is the same as that of _mm_unpackhi_pi8. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpckhbh %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpckhhw; named after the MMX high-half 16-bit interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpckhhw %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpckhhw; the body is the same as that of _mm_unpackhi_pi16. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpckhhw %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpckhwd; named after the MMX high-half 32-bit interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpckhwd %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpcklbh; named after the MMX low-half byte interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklbh %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
+ which preserves the data. */
+
+/* Emits punpcklbh with both sources taken as full __m64 values. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklbh %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
+ datatype, which allows load8888 to use 32-bit loads. */
+
+/* Emits punpcklbh with a 32-bit (__m32) first source and a 64-bit second
+   source. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklbh %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpcklhw; named after the MMX low-half 16-bit interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklhw %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpcklhw; the body is the same as that of _mm_unpacklo_pi16. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklhw %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Emits punpcklwd; named after the MMX low-half 32-bit interleave. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklwd %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+
+/* Emits punpcklwd; the body is the same as that of _mm_unpacklo_pi32. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 merged;
+
+  asm("punpcklwd %0, %1, %2\n\t" : "=f" (merged) : "f" (__m1), "f" (__m2));
+  return merged;
+}
+
+/* Packs src with _mm_packs_pu16 (second operand zero) and stores the low
+   32 bits of the packed result to *dest with swc1. */
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+  const __m64 packed = _mm_packs_pu16(src, _mm_setzero_si64());
+
+  asm("swc1 %1, %0\n\t" : "=m" (*dest) : "f" (packed) : "memory");
+}
+
+/* Stores the 64-bit vector to *dest with sdc1. */
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+  asm("sdc1 %1, %0 \n\t" : "=m" (*dest) : "f" (src) : "memory");
+}
+
+/* Stores the 64-bit vector with the gssdlc1/gssdrc1 pair addressed at
+   dest + 7 and dest + 0 (the "u" in the name indicates the unaligned
+   variant).  The address is passed in a general-purpose register, so the
+   write is made visible to the compiler through the "memory" clobber. */
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      : /* no outputs */
+      : "r" (dest), "f" (src)
+      : "memory");
+}
+
+/* Loads 32 bits from *src into an FP register with lwc1.
+   NOTE(review): the loaded value is held in an __m32 (float) local while
+   the function is declared to return __m64 (double), so the return
+   statement performs a float-to-double value conversion rather than a
+   bit-for-bit copy.  Confirm what callers expect before changing either
+   type. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+  __m32 word;
+
+  asm("lwc1 %0, %1\n\t" : "=f" (word) : "m" (*src));
+  return word;
+}
+
+/* Loads a 64-bit vector from *src with ldc1. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+  __m64 loaded;
+
+  asm("ldc1 %0, %1\n\t" : "=f" (loaded) : "m" (*src) : "memory");
+  return loaded;
+}
+
+/* Loads a 64-bit vector with the gsldlc1/gsldrc1 pair addressed at
+   src + 7 and src + 0 (the "u" in the name indicates the unaligned
+   variant).  The address is passed in a general-purpose register, so the
+   read is made visible to the compiler through the "memory" clobber. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+  __m64 loaded;
+
+  asm("gsldlc1 %0, 7(%1)\n\t"
+      "gsldrc1 %0, 0(%1)\n\t"
+      : "=f" (loaded)
+      : "r" (src)
+      : "memory");
+
+  return loaded;
+}
+
+/* Reads 32 bits at src as an __m32 and interleaves them with a zero vector
+   through _mm_unpacklo_pi8_f. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+  const __m32 word = *(__m32 *)src;
+
+  return _mm_unpacklo_pi8_f(word, _mm_setzero_si64());
+}
+
+/* Interleaves src with a zero vector through _mm_unpacklo_pi8_f64 (src is
+   the first operand). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpacklo_pi8_f64(src, zero);
+}
+
+/* Interleaves src with a zero vector through _mm_unpackhi_pi8_f (src is
+   the first operand). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpackhi_pi8_f(src, zero);
+}
+
+/* Interleaves src with a zero vector through _mm_unpacklo_pi16 (src is the
+   first operand). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpacklo_pi16(src, zero);
+}
+
+/* Interleaves a zero vector with src through _mm_unpacklo_pi16_f (the zero
+   vector is the first operand, src the second). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpacklo_pi16_f(zero, src);
+}
+
+/* Interleaves src with a zero vector through _mm_unpackhi_pi16 (src is the
+   first operand). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpackhi_pi16(src, zero);
+}
+
+/* Interleaves a zero vector with src through _mm_unpackhi_pi16_f (the zero
+   vector is the first operand, src the second). */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+  const __m64 zero = _mm_setzero_si64();
+
+  return _mm_unpackhi_pi16_f(zero, src);
+}
+
+/* Shuffles pixel with selector _MM_SHUFFLE(3, 3, 3, 3), i.e. every
+   selector field names 16-bit lane 3. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+  const int64_t all_lane3 = _MM_SHUFFLE(3, 3, 3, 3);
+
+  return _mm_shuffle_pi16(pixel, all_lane3);
+}
+
+/* Shuffles pixel with selector _MM_SHUFFLE(0, 0, 0, 0), i.e. every
+   selector field names 16-bit lane 0. */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+  const int64_t all_lane0 = _MM_SHUFFLE(0, 0, 0, 0);
+
+  return _mm_shuffle_pi16(pixel, all_lane0);
+}
+
+#endif /* __LOONGSON_MMINTRIN_H__ */
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+;
+; Register pair A/B follows whichever colour component has index 0:
+; red -> registers 0/1, green -> 2/3, blue -> 4/5.  If none of the three
+; components has index 0, the pair falls back to registers 6/7.
+; The same choice is applied to the mm, xmm and ymm register files.
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%define ymmA ymm0
+%define ymmB ymm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%define ymmA ymm2
+%define ymmB ymm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%define ymmA ymm4
+%define ymmB ymm5
+%else
+%define mmA mm6
+%define mmB mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%define ymmA ymm6
+%define ymmB ymm7
+%endif
+
+; Register pair C/D follows whichever colour component has index 1:
+; red -> registers 0/1, green -> 2/3, blue -> 4/5.  If none of the three
+; components has index 1, the pair falls back to registers 6/7.
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%define ymmC ymm0
+%define ymmD ymm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%define ymmC ymm2
+%define ymmD ymm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%define ymmC ymm4
+%define ymmD ymm5
+%else
+%define mmC mm6
+%define mmD mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%define ymmC ymm6
+%define ymmD ymm7
+%endif
+
+; Register pair E/F follows whichever colour component has index 2:
+; red -> registers 0/1, green -> 2/3, blue -> 4/5.  If none of the three
+; components has index 2, the pair falls back to registers 6/7.
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%define ymmE ymm0
+%define ymmF ymm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%define ymmE ymm2
+%define ymmF ymm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%define ymmE ymm4
+%define ymmF ymm5
+%else
+%define mmE mm6
+%define mmF mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%define ymmE ymm6
+%define ymmF ymm7
+%endif
+
+; Register pair G/H follows whichever colour component has index 3:
+; red -> registers 0/1, green -> 2/3, blue -> 4/5.  If none of the three
+; components has index 3, the pair falls back to registers 6/7.
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%define ymmG ymm0
+%define ymmH ymm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%define ymmG ymm2
+%define ymmH ymm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%define ymmG ymm4
+%define ymmH ymm5
+%else
+%define mmG mm6
+%define mmH mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%define ymmG ymm6
+%define ymmH ymm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
new file mode 100644
index 0000000000..9192f66f0c
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -0,0 +1,31 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required. We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+; (For example, 255 * 4 + 3 = 1023 when MAXJSAMPLE is 255.)
+%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
+
+; Address arithmetic helpers.  In all of them b is added unchanged and s
+; is a multiplier (presumably a base address and an element size in bytes
+; -- confirm against the DCT sources):
+;   ROW(n, b, s) = b + n * s
+;   COL(n, b, s) = b + n * s * DCTSIZE
+%define ROW(n, b, s) ((b) + (n) * (s))
+%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE)
+
+; xBLOCK(m, n, b, s) = b + m * DCTSIZE * s + n * SIZEOF_<unit>, where the
+; unit is a DWORD, MMWORD, XMMWORD or YMMWORD respectively.
+%define DWBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
new file mode 100644
index 0000000000..667024a5f9
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -0,0 +1,93 @@
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+;
+; -- jpeglib.h
+;
+; DCTSIZE2 equals DCTSIZE * DCTSIZE (8 * 8 = 64).
+%define DCTSIZE 8
+%define DCTSIZE2 64
+;
+; -- jmorecfg.h
+;
+; One group of four values per pixel layout.  Judging by the names, the
+; _RED/_GREEN/_BLUE values are the positions of the colour samples within
+; a pixel and _PIXELSIZE is the number of samples per pixel (3, or 4 for
+; the layouts with an X filler) -- see jmorecfg.h for the authoritative
+; definitions.
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define EXT_RGB_RED 0
+%define EXT_RGB_GREEN 1
+%define EXT_RGB_BLUE 2
+%define EXT_RGB_PIXELSIZE 3
+%define EXT_RGBX_RED 0
+%define EXT_RGBX_GREEN 1
+%define EXT_RGBX_BLUE 2
+%define EXT_RGBX_PIXELSIZE 4
+%define EXT_BGR_RED 2
+%define EXT_BGR_GREEN 1
+%define EXT_BGR_BLUE 0
+%define EXT_BGR_PIXELSIZE 3
+%define EXT_BGRX_RED 2
+%define EXT_BGRX_GREEN 1
+%define EXT_BGRX_BLUE 0
+%define EXT_BGRX_PIXELSIZE 4
+%define EXT_XBGR_RED 3
+%define EXT_XBGR_GREEN 2
+%define EXT_XBGR_BLUE 1
+%define EXT_XBGR_PIXELSIZE 4
+%define EXT_XRGB_RED 1
+%define EXT_XRGB_GREEN 2
+%define EXT_XRGB_BLUE 3
+%define EXT_XRGB_PIXELSIZE 4
+%define RGBX_FILLER_0XFF 1
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define CENTERJSAMPLE 128
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+; Pointer-typed aliases.  All four map to POINTER / SIZEOF_POINTER, which
+; are not defined in this file.
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+;
+; -- jdct.h
+;
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+; NOTE(review): the next line defines a macro literally named `float`.
+; Given the SIZEOF_FAST_FLOAT line after it, the intended name was
+; presumably FAST_FLOAT, expanded to `float` by the C preprocessor when
+; this file was generated -- confirm against jsimdcfg.inc.h.
+%define float FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
+; NOTE(review): "Type short" in the next comment looks like another
+; preprocessor expansion of a type name from the generator input.
+; To maximize parallelism, Type short is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+;
+; -- jsimd.h
+;
+; SIMD capability flags.  Each non-zero value has exactly one bit set, so
+; the values can be combined into a bit mask; JSIMD_NONE is zero.
+%define JSIMD_NONE 0x00
+%define JSIMD_MMX 0x01
+%define JSIMD_3DNOW 0x02
+%define JSIMD_SSE 0x04
+%define JSIMD_SSE2 0x08
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF 1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+; System-dependent configurations
+
+%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use32 class=CODE
+%define SEG_CONST .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use64 class=CODE
+%define SEG_CONST .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT _text align=32 public use32 class=CODE
+%define SEG_CONST _data align=32 public use32 class=DATA
+
+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT .text progbits align=32
+%define SEG_CONST .rodata progbits align=32
+%else
+%define SEG_TEXT .text progbits alloc exec nowrite align=32
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
+%define SEG_CONST .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
+
+%else ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+%endif ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+; Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resq
+%define dp dq
+%define raxp rax
+%define rbxp rbx
+%define rcxp rcx
+%define rdxp rdx
+%define rsip rsi
+%define rdip rdi
+%define rbpp rbp
+%define rspp rsp
+%define r8p r8
+%define r9p r9
+%define r10p r10
+%define r11p r11
+%define r12p r12
+%define r13p r13
+%define r14p r14
+%define r15p r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resd
+%define dp dd
+; x86_64 ILP32 ABI (x32)
+%define raxp eax
+%define rbxp ebx
+%define rcxp ecx
+%define rdxp edx
+%define rsip esi
+%define rdip edi
+%define rbpp ebp
+%define rspp esp
+%define r8p r8d
+%define r9p r9d
+%define r10p r10d
+%define r11p r11d
+%define r12p r12d
+%define r13p r13d
+%define r14p r14d
+%define r15p r15d
+%endif
+
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD ; int256 (AVX register)
+%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
+%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE 1 ; sizeof(byte)
+%define SIZEOF_WORD 2 ; sizeof(word)
+%define SIZEOF_DWORD 4 ; sizeof(dword)
+%define SIZEOF_QWORD 8 ; sizeof(qword)
+%define SIZEOF_OWORD 16 ; sizeof(oword)
+%define SIZEOF_YWORD 32 ; sizeof(yword)
+
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+; External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name) _ %+ name ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+; Hidden symbols
+;
+%ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
+%define GLOBAL_DATA(name) global EXTN(name):data hidden
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name) global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name) global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+; Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+ SECTION SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym) (got) + (sym) - const_base
+
+; get_GOT <reg>  (Mach-O variant)
+; Loads the run-time address of const_base into <reg>, for use as the "got"
+; operand of GOTOFF().  The address of %%ref is first computed in ecx from the
+; return address pushed by the call, then (const_base - %%ref) is added by a
+; hand-encoded lea whose 32-bit displacement is supplied by the operand bytes
+; of a "jmp near const_base" instruction.
+; Clobbers ecx (unless <reg> is ecx, in which case it holds the result);
+; ebp is saved and restored.
+%imacro get_GOT 1
+ ; NOTE: this macro destroys ecx register.
+ call %%geteip
+ add ecx, byte (%%ref - $)
+ jmp short %%adjust
+%%geteip:
+ mov ecx, POINTER [esp]
+ ret
+%%adjust:
+ push ebp
+ xor ebp, ebp ; ebp = 0
+%ifidni %1, ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D, 0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+%else ; (%1 != ebx)
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D, 0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+ mov %1, ecx
+%endif ; (%1 == ebx)
+ pop ebp
+%endmacro
+
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
+
+; get_GOT <reg>  (ELF / a.out variant)
+; Loads the address of GOT_SYMBOL into <reg>: the call pushes the address of
+; the following add instruction, %%geteip copies that return address into
+; <reg>, and the add then applies the ..gotpc-relative offset of GOT_SYMBOL.
+; Only <reg> is modified.
+%imacro get_GOT 1
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
+%%geteip:
+ mov %1, POINTER [esp]
+ ret
+%%done:
+%endmacro
+
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+; PIC build: pushpic/poppic/movpic expand to a plain push/pop/mov of their
+; operands.  (The non-PIC branch below defines the same names as empty macros.)
+%imacro pushpic 1.nolist
+ push %1
+%endmacro
+%imacro poppic 1.nolist
+ pop %1
+%endmacro
+%imacro movpic 2.nolist
+ mov %1, %2
+%endmacro
+
+%else ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym) (sym)
+
+; Non-PIC build: the PIC helper macros are defined with empty bodies, so
+; invoking them emits no code.
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic 1.nolist
+%endmacro
+%imacro movpic 2.nolist
+%endmacro
+
+%endif ; PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+; Align the next instruction on {2,4,8,16,..}-byte boundary.
+; ".balign n,,m" in GNU as
+;
+; MSKLE(x, y): 0 when x > y; a mask with the low bits set when x <= y
+; (both operands are truncated to 16 bits before the comparison).
+; FILLB(b, n): number of bytes from position b up to the next n-byte boundary,
+; measured from the start of the current section ($$); n must be a power of 2.
+%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n) (($$-(b)) & ((n)-1))
+
+; alignx <alignment> [, <max_padding> = 0xFFFF]
+; Pads up to the next <alignment>-byte boundary.  Nothing is emitted if the
+; padding needed at the start of the macro exceeds <max_padding>.  When 16 or
+; more bytes are needed they are all single-byte nops; otherwise the gap is
+; filled with progressively shorter no-op encodings (7, 6, 4, 3, 2, 1 bytes).
+; NOTE(review): the byte sequences are annotated as 32-bit instructions on
+; ebx/ebp -- confirm their effect before relying on this in 64-bit code.
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+ db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+ db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+ db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+ db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+ db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+ db 0x8B, 0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+ db 0x90 ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+; alignz <alignment>: pad with zero bytes up to the next <alignment>-byte
+; boundary (wrapper around the "align" directive with "db 0" as the filler).
+%imacro alignz 1.nolist
+ align %1, db 0 ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+; collect_args <n>  (Win64)
+; Saves xmm6 and xmm7 on the stack, then copies the first <n> (1..6) integer
+; arguments into r10, r11, r12, r13, r14, r15 in that order.  Arguments 1-4
+; come from rcx, rdx, r8, r9; arguments 5 and 6 are loaded from [rax+48] and
+; [rax+56].  r12-r15 are pushed before being overwritten; rsi and rdi are
+; always pushed last.  Undo with uncollect_args <n>.
+; NOTE(review): rax is not set here -- presumably the enclosing function
+; loads it with a stack-frame base before invoking this macro; confirm at the
+; call sites.  The movaps stores also assume rsp is 16-byte aligned on entry.
+%imacro collect_args 1
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm6
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm7
+ mov r10, rcx
+%if %1 > 1
+ mov r11, rdx
+%endif
+%if %1 > 2
+ push r12
+ mov r12, r8
+%endif
+%if %1 > 3
+ push r13
+ mov r13, r9
+%endif
+%if %1 > 4
+ push r14
+ mov r14, [rax+48]
+%endif
+%if %1 > 5
+ push r15
+ mov r15, [rax+56]
+%endif
+ push rsi
+ push rdi
+%endmacro
+
+; uncollect_args <n>  (Win64)
+; Reverses collect_args <n>: pops rdi and rsi, then r15..r12 for whichever of
+; them were pushed, and finally reloads xmm7 and xmm6 from the stack.  <n> must
+; match the value given to collect_args so that pops mirror the pushes.
+%imacro uncollect_args 1
+ pop rdi
+ pop rsi
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+ movaps xmm7, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+%endmacro
+
+; push_xmm <n>  (Win64)
+; Reserves <n> XMMWORD-sized stack slots and stores xmm8 .. xmm(7+<n>) in them,
+; for <n> up to 4.  xmm8 is stored unconditionally, so <n> must be at least 1.
+; Undo with pop_xmm <n>.
+%imacro push_xmm 1
+ sub rsp, %1 * SIZEOF_XMMWORD
+ movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+ movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+ movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+ movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+; pop_xmm <n>  (Win64)
+; Reverses push_xmm <n>: reloads xmm8 .. xmm(7+<n>) from the stack slots and
+; releases the <n> XMMWORD-sized slots.  xmm8 is reloaded unconditionally.
+%imacro pop_xmm 1
+ movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+ movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+ movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+ movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+ add rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+; collect_args <n>  (x86-64, non-Win64)
+; Copies the first <n> (1..6) integer arguments from rdi, rsi, rdx, rcx, r8,
+; r9 into r10, r11, r12, r13, r14, r15 respectively, pushing the previous
+; value of each destination register first.  This gives the function body
+; the same r10-r15 argument layout as the Win64 variant of this macro.
+; Undo with uncollect_args <n>.
+%imacro collect_args 1
+ push r10
+ mov r10, rdi
+%if %1 > 1
+ push r11
+ mov r11, rsi
+%endif
+%if %1 > 2
+ push r12
+ mov r12, rdx
+%endif
+%if %1 > 3
+ push r13
+ mov r13, rcx
+%endif
+%if %1 > 4
+ push r14
+ mov r14, r8
+%endif
+%if %1 > 5
+ push r15
+ mov r15, r9
+%endif
+%endmacro
+
+; uncollect_args <n>  (x86-64, non-Win64)
+; Reverses collect_args <n>: pops r15 down to r10 in the opposite order to
+; the pushes, restoring only the registers that collect_args <n> saved.
+%imacro uncollect_args 1
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+%if %1 > 1
+ pop r11
+%endif
+ pop r10
+%endmacro
+
+; Non-Win64 x86-64: push_xmm/pop_xmm are defined with empty bodies and emit
+; no code (the Win64 branch above saves/restores xmm8-xmm11 instead).
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+; Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/powerpc/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c
new file mode 100644
index 0000000000..170f90ff80
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c
@@ -0,0 +1,269 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+
+/*
+ * Convert rows of packed RGB-family pixels to planar Y, Cb and Cr samples.
+ *
+ * This body is a template: jccolor-altivec.c includes it repeatedly, each
+ * time with a different RGB_PIXELSIZE, RGBG_INDEX* permute table and function
+ * name, to produce one converter per pixel layout.
+ *
+ * img_width   number of pixels per row
+ * input_buf   array of input row pointers (one row is consumed per iteration)
+ * output_buf  output planes; [0] receives Y, [1] Cb and [2] Cr
+ * output_row  index of the first row to write in each output plane
+ * num_rows    number of rows to convert
+ *
+ * Each inner-loop iteration consumes 16 pixels and stores one full 16-byte
+ * vector to each output plane, including for the final partial chunk of a row.
+ * NOTE(review): this presumably relies on the output rows being padded to a
+ * multiple of 16 samples and suitably aligned for vec_st() -- confirm against
+ * the caller.
+ */
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ /* pitch is the row length in bytes; num_cols counts bytes left in a row */
+ int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+ int offset;
+#endif
+ /* Aligned bounce buffer used by the slow paths to avoid reading past the
+ * end of an input row.
+ */
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+ /* rgb1..rgb4 start out zeroed because the fast paths below load them only
+ * when enough bytes remain in the row.
+ */
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+ rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+ __vector unsigned char rgb4 = { 0 };
+#endif
+ __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+ __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+ __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+ /* Constants */
+ /* Each pw_* vector holds a pair of fixed-point coefficients that is applied
+ * to an (R,G) or (B,G) pair by a multiply-sum.  pb_zero is not referenced
+ * directly in this file; presumably the VEC_UNPACK*U macros use it -- TODO
+ * confirm in jsimd_altivec.h.
+ */
+ __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+ pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+ pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+ pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+ __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) },
+ pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+ __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ /* One iteration per 16 pixels: RGB_PIXELSIZE * 16 input bytes in, 16
+ * samples out to each of the three planes.
+ */
+ for (num_cols = pitch; num_cols > 0;
+ num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+ outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+ /* Load 16 pixels == 48 or 64 bytes */
+ offset = (size_t)inptr & 15;
+ if (offset) {
+ /* Unaligned input: load one extra vector and realign with vec_perm() */
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset;
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overread. Since there is no way to
+ * read a partial AltiVec register, overread would occur on the last
+ * chunk of the last image row if the right edge is not on a 16-byte
+ * boundary. It could also occur on other rows if the bytes per row
+ * is low enough. Since we can't determine whether we're on the last
+ * image row, we have to assume every row is the last.
+ */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_ld(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (bytes > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (bytes > 32)
+ rgb2 = vec_ld(32, inptr);
+ if (bytes > 48)
+ rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ rgb4 = vec_ld(64, inptr);
+#endif
+ unaligned_shift_index = vec_lvsl(0, inptr);
+ rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+ rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+ }
+ } else {
+#endif /* __BIG_ENDIAN__ */
+ /* Aligned input on big-endian targets, or any input otherwise */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = VEC_LD(0, tmpbuf);
+ rgb1 = VEC_LD(16, tmpbuf);
+ rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = VEC_LD(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path */
+ rgb0 = VEC_LD(0, inptr);
+ if (num_cols > 16)
+ rgb1 = VEC_LD(16, inptr);
+ if (num_cols > 32)
+ rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = VEC_LD(48, inptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ }
+#endif
+
+#if RGB_PIXELSIZE == 3
+ /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+ rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+ rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+ rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+ /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+ rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+ rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+ rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+ /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+ * ...
+ *
+ * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+ /* (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
+
+ /* Calculate Y values */
+
+ /* The G coefficient is split across the (R,G) and (B,G) multiply-sums,
+ * as shown in the "This implementation" formula above.
+ */
+ y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+ y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+ y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+ y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+ y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+ y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+ y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+ y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+ shift_pack_index);
+ yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+ shift_pack_index);
+ y = vec_pack(yl, yh);
+ vec_st(y, 0, outptr0);
+
+ /* Calculate Cb values */
+ /* The 0.5 * B term goes through the unsigned vec_msum(): F_0_500 is
+ * defined as 32768 in jccolor-altivec.c, which does not fit in a signed
+ * 16-bit element.
+ */
+ cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+ cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+ (__vector unsigned int)cb0);
+ cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+ (__vector unsigned int)cb1);
+ cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+ (__vector unsigned int)cb2);
+ cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+ (__vector unsigned int)cb3);
+ cbl = vec_perm((__vector unsigned short)cb0,
+ (__vector unsigned short)cb1, shift_pack_index);
+ cbh = vec_perm((__vector unsigned short)cb2,
+ (__vector unsigned short)cb3, shift_pack_index);
+ cb = vec_pack(cbl, cbh);
+ vec_st(cb, 0, outptr1);
+
+ /* Calculate Cr values */
+ /* Same scheme as Cb, with the roles of the (R,G) and (B,G) pairs swapped */
+ cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+ cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+ (__vector unsigned int)cr0);
+ cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+ (__vector unsigned int)cr1);
+ cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+ (__vector unsigned int)cr2);
+ cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+ (__vector unsigned int)cr3);
+ crl = vec_perm((__vector unsigned short)cr0,
+ (__vector unsigned short)cr1, shift_pack_index);
+ crh = vec_perm((__vector unsigned short)cr2,
+ (__vector unsigned short)cr3, shift_pack_index);
+ cr = vec_pack(crl, crh);
+ vec_st(cr, 0, outptr2);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329 /* FIX(0.08131) */
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_168 11059 /* FIX(0.16874) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_331 21709 /* FIX(0.33126) */
+#define F_0_418 27439 /* FIX(0.41869) */
+#define F_0_500 32768 /* FIX(0.50000) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+/* 0.587 is applied as 0.337 + 0.250: 38470 would not fit a signed 16-bit lane */
+#define SCALEBITS 16 /* the FIX() values above are scaled by 2^16 */
+#define ONE_HALF (1 << (SCALEBITS - 1)) /* 0.5 in that scale (rounding term) */
+
+/* vec_perm() byte selectors consumed by jcgryext-altivec.c (R,G,B byte order) */
+#define RGBG_INDEX0 /* builds rgbg0: pixels 0-3 */ \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 /* builds rgbg1: pixels 4-7 */ \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 /* builds rgbg2: pixels 8-11 */ \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 /* builds rgbg3: pixels 12-15 */ \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c" /* no rename active: emits jsimd_rgb_gray_convert_altivec() */
+#undef RGB_PIXELSIZE /* NOTE(review): not defined in this file; presumably set by a header */
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c" /* EXT_RGB instance; reuses the four tables defined above */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX /* single table, applied to each of the four input vectors */ \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c" /* emits jsimd_extrgbx_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE /* tables below: the RGB tables with their 8-byte halves exchanged */
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c" /* emits jsimd_extbgr_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX /* the RGBX table with its 8-byte halves exchanged */ \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c" /* emits jsimd_extbgrx_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX /* colour bytes sit at offsets 1..3 of each 4-byte pixel */ \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c" /* emits jsimd_extxbgr_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX /* the XBGR table with its 8-byte halves exchanged */ \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c" /* emits jsimd_extxrgb_gray_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
new file mode 100644
index 0000000000..b280cbbded
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
@@ -0,0 +1,228 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{ /* For each of num_rows rows: packed RGB_PIXELSIZE-byte pixels -> one Y byte per pixel. */
+ JSAMPROW inptr, outptr;
+ int pitch = img_width * RGB_PIXELSIZE, num_cols; /* pitch = input bytes per row */
+#if __BIG_ENDIAN__
+ int offset; /* low 4 bits of inptr, i.e. its misalignment within 16 bytes */
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; /* bounce buffer for the slow paths */
+#endif
+
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, /* zeroed once: the guarded loads below may skip them */
+ rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+ __vector unsigned char rgb4 = { 0 }; /* extra vector needed only by the unaligned realignment below */
+#endif
+ __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+ __vector unsigned short yl, yh;
+ __vector int y0, y1, y2, y3;
+
+ /* Constants (coefficient pairs line up with the R,G and B,G sample pairs) */
+ __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+ pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) };
+ __vector unsigned char pb_zero = { __16X(0) }, /* not referenced by name here; presumably used by VEC_UNPACK*U */
+#if __BIG_ENDIAN__
+ shift_pack_index = /* bytes 0-1 of every 32-bit element */
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+ shift_pack_index = /* bytes 2-3 of every 32-bit element */
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row]; /* results go to component plane 0 */
+ output_row++;
+ /* Each iteration consumes RGB_PIXELSIZE * 16 input bytes and emits 16 Y bytes. */
+ for (num_cols = pitch; num_cols > 0;
+ num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+ outptr += 16) {
+
+#if __BIG_ENDIAN__
+ /* Load 16 pixels == 48 or 64 bytes */
+ offset = (size_t)inptr & 15;
+ if (offset) {
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset; /* remaining bytes, counted from 16-byte boundary below inptr */
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overread. Since there is no way to
+ * read a partial AltiVec register, overread would occur on the last
+ * chunk of the last image row if the right edge is not on a 16-byte
+ * boundary. It could also occur on other rows if the bytes per row
+ * is low enough. Since we can't determine whether we're on the last
+ * image row, we have to assume every row is the last.
+ */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_ld(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path: guarded vec_ld() loads, then realign with vec_lvsl()/vec_perm() */
+ rgb0 = vec_ld(0, inptr);
+ if (bytes > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (bytes > 32)
+ rgb2 = vec_ld(32, inptr);
+ if (bytes > 48)
+ rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ rgb4 = vec_ld(64, inptr);
+#endif
+ unaligned_shift_index = vec_lvsl(0, inptr);
+ rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+ rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+ }
+ } else {
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path (inptr aligned, but the final chunk is a partial vector) */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ rgb3 = vec_ld(48, tmpbuf);
+#endif
+ } else {
+ /* Fast path (inptr aligned): load only vectors that hold row data */
+ rgb0 = vec_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = vec_ld(48, inptr);
+#endif
+ }
+ }
+#else
+ /* Little endian: vec_vsx_ld() straight from inptr, no realignment step */
+ rgb0 = vec_vsx_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_vsx_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+ /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+ rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+ rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+ rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+ /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ *
+ * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ */
+ rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+ rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+ rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+ rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+ /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+ * ...
+ *
+ * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+ bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+ rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+ bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+ rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+ bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+ rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+ bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+ /* (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ */
+
+ /* Calculate Y values: first R,G terms plus ONE_HALF, then add the B,G terms */
+
+ y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+ y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+ y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+ y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+ y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+ y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+ y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+ y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+ shift_pack_index);
+ yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+ shift_pack_index);
+ y = vec_pack(yl, yh); /* sixteen 16-bit values -> sixteen bytes */
+ vec_st(y, 0, outptr); /* always a whole vector; NOTE(review): assumes aligned, padded output rows - confirm */
+ }
+ }
+}
diff --git a/media/libjpeg/simd/powerpc/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c
new file mode 100644
index 0000000000..6e25b8db90
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c
@@ -0,0 +1,159 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
+{ /* 2:1 horizontal downsample: each output byte is (left + right + bias) >> 1. */
+ int outrow, outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE; /* output samples per row */
+ JSAMPROW inptr, outptr;
+
+ __vector unsigned char this0, next0, out;
+ __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+ /* Constants */
+ __vector unsigned short pw_bias = { __4X2(0, 1) }, /* presumably the pair 0,1 repeated across the lanes */
+ pw_one = { __8X(1) }; /* shift count for the divide by 2 */
+ __vector unsigned char even_odd_index = /* even-numbered bytes first, odd-numbered bytes second */
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ pb_zero = { __16X(0) }; /* not referenced by name here; presumably used by VEC_UNPACK*U */
+ /* Pad rows on the right (repeating the last sample) up to 2 * output_cols. */
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2);
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+ /* Each iteration reads up to 32 input bytes and stores 16 output bytes. */
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 16, inptr += 32, outptr += 16) {
+
+ this0 = vec_ld(0, inptr);
+ this0 = vec_perm(this0, this0, even_odd_index);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0); /* widened even samples */
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0); /* widened odd samples */
+ outl = vec_add(this0e, this0o);
+ outl = vec_add(outl, pw_bias);
+ outl = vec_sr(outl, pw_one);
+
+ if (outcol > 8) { /* second input vector is read only if more than 8 outputs remain */
+ next0 = vec_ld(16, inptr);
+ next0 = vec_perm(next0, next0, even_odd_index);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+ outh = vec_add(next0e, next0o);
+ outh = vec_add(outh, pw_bias);
+ outh = vec_sr(outh, pw_one);
+ } else
+ outh = vec_splat_u16(0); /* upper 8 output bytes become zero */
+
+ out = vec_pack(outl, outh);
+ vec_st(out, 0, outptr); /* always a whole 16-byte store */
+ }
+ }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* 2x2 downsample: each output byte is (sum of a 2x2 input square + bias) >> 2. */
+ int inrow, outrow, outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE; /* output samples per row */
+ JSAMPROW inptr0, inptr1, outptr;
+
+ __vector unsigned char this0, next0, this1, next1, out;
+ __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+ /* Constants */
+ __vector unsigned short pw_bias = { __4X2(1, 2) }, /* presumably the pair 1,2 repeated across the lanes */
+ pw_two = { __8X(2) }; /* shift count for the divide by 4 */
+ __vector unsigned char even_odd_index = /* even-numbered bytes first, odd-numbered bytes second */
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ pb_zero = { __16X(0) }; /* not referenced by name here; presumably used by VEC_UNPACK*U */
+ /* Pad rows on the right (repeating the last sample) up to 2 * output_cols. */
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2);
+
+ for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+ inrow += 2, outrow++) {
+ /* Two consecutive input rows feed one output row. */
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr = output_data[outrow];
+
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+ /* Horizontal pair sums of the upper row */
+ this0 = vec_ld(0, inptr0);
+ this0 = vec_perm(this0, this0, even_odd_index);
+ this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+ this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+ out0l = vec_add(this0e, this0o);
+ /* Horizontal pair sums of the lower row */
+ this1 = vec_ld(0, inptr1);
+ this1 = vec_perm(this1, this1, even_odd_index);
+ this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+ this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+ out1l = vec_add(this1e, this1o);
+ /* Combine both rows, add the bias, divide by 4 */
+ outl = vec_add(out0l, out1l);
+ outl = vec_add(outl, pw_bias);
+ outl = vec_sr(outl, pw_two);
+
+ if (outcol > 8) { /* second input vectors are read only if more than 8 outputs remain */
+ next0 = vec_ld(16, inptr0);
+ next0 = vec_perm(next0, next0, even_odd_index);
+ next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+ next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+ out0h = vec_add(next0e, next0o);
+
+ next1 = vec_ld(16, inptr1);
+ next1 = vec_perm(next1, next1, even_odd_index);
+ next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+ next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+ out1h = vec_add(next1e, next1o);
+
+ outh = vec_add(out0h, out1h);
+ outh = vec_add(outh, pw_bias);
+ outh = vec_sr(outh, pw_two);
+ } else
+ outh = vec_splat_u16(0); /* upper 8 output bytes become zero */
+
+ out = vec_pack(outl, outh);
+ vec_st(out, 0, outptr); /* always a whole 16-byte store */
+ }
+ }
+}
diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+/* Widen each of num_rows rows to output_cols by repeating its last sample. */
+{
+  const int pad = (int)(output_cols - input_cols);
+  int r, k;
+
+  if (pad <= 0)
+    return;                     /* rows are already wide enough */
+
+  for (r = 0; r < num_rows; r++) {
+    JSAMPROW dst = image_data[r] + input_cols;  /* first padding position */
+    const JSAMPLE fill = dst[-1];               /* last real sample in the row */
+
+    for (k = 0; k < pad; k++)
+      dst[k] = fill;
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
new file mode 100644
index 0000000000..68d52bd8a2
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
@@ -0,0 +1,276 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* For each of num_rows rows: Y, Cb, Cr planes -> packed RGB_PIXELSIZE-byte pixels. */
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int pitch = out_width * RGB_PIXELSIZE, num_cols; /* pitch = output bytes per row */
+#if __BIG_ENDIAN__
+ int offset; /* low 4 bits of outptr, i.e. its misalignment within 16 bytes */
+#endif
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; /* staging area for the slow paths */
+
+ __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3;
+#endif
+ __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+ __vector int g0, g1, g2, g3;
+
+ /* Constants
+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+ * high-order bits, not 16.
+ */
+ __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+ pw_mf0228 = { __8X(-F_0_228 >> 1) },
+ pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, /* pairs with the merged (Cb, Cr) samples */
+ pw_one = { __8X(1) }, pw_255 = { __8X(255) }, /* pw_255 supplies the X byte */
+ pw_cj = { __8X(CENTERJSAMPLE) }; /* subtracted from Cb and Cr to centre them */
+ __vector int pd_onehalf = { __4X(ONE_HALF) };
+ __vector unsigned char pb_zero = { __16X(0) }, /* not referenced by name here; presumably used by VEC_UNPACK*U */
+#if __BIG_ENDIAN__
+ shift_pack_index = /* bytes 0-1 of every 32-bit element */
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+ shift_pack_index = /* bytes 2-3 of every 32-bit element */
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row]; /* Y plane */
+ inptr1 = input_buf[1][input_row]; /* Cb plane */
+ inptr2 = input_buf[2][input_row]; /* Cr plane */
+ input_row++;
+ outptr = *output_buf++;
+ /* Each iteration reads 16 samples per plane and produces 16 output pixels. */
+ for (num_cols = pitch; num_cols > 0;
+ num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+ inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+ y = vec_ld(0, inptr0);
+ /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ yl = (__vector signed short)VEC_UNPACKHU(y);
+ yh = (__vector signed short)VEC_UNPACKLU(y);
+
+ cb = vec_ld(0, inptr1);
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
+ cbl = vec_sub(cbl, pw_cj);
+ cbh = vec_sub(cbh, pw_cj);
+
+ cr = vec_ld(0, inptr2);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
+ crl = vec_sub(crl, pw_cj);
+ crh = vec_sub(crh, pw_cj);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+ bl = vec_add(cbl, cbl); /* B: start from 2 * Cb as the multiplicand */
+ bh = vec_add(cbh, cbh);
+ bl = vec_madds(bl, pw_mf0228, pw_one);
+ bh = vec_madds(bh, pw_mf0228, pw_one);
+ bl = vec_sra(bl, (__vector unsigned short)pw_one);
+ bh = vec_sra(bh, (__vector unsigned short)pw_one);
+ bl = vec_add(bl, cbl);
+ bh = vec_add(bh, cbh);
+ bl = vec_add(bl, cbl);
+ bh = vec_add(bh, cbh);
+ bl = vec_add(bl, yl);
+ bh = vec_add(bh, yh);
+
+ rl = vec_add(crl, crl); /* R: same scheme with Cr and F_0_402 */
+ rh = vec_add(crh, crh);
+ rl = vec_madds(rl, pw_f0402, pw_one);
+ rh = vec_madds(rh, pw_f0402, pw_one);
+ rl = vec_sra(rl, (__vector unsigned short)pw_one);
+ rh = vec_sra(rh, (__vector unsigned short)pw_one);
+ rl = vec_add(rl, crl);
+ rh = vec_add(rh, crh);
+ rl = vec_add(rl, yl);
+ rh = vec_add(rh, yh);
+
+ g0w = vec_mergeh(cbl, crl); /* G: interleave Cb,Cr for the multiply-sum */
+ g1w = vec_mergel(cbl, crl);
+ g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+ g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+ g2w = vec_mergeh(cbh, crh);
+ g3w = vec_mergel(cbh, crh);
+ g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+ g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+ gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+ gl = vec_sub(gl, crl);
+ gh = vec_sub(gh, crh);
+ gl = vec_add(gl, yl);
+ gh = vec_add(gh, yh);
+ /* Interleave into (R,G) and (B,X) 16-bit pairs, four pixels per vector. */
+ rg0 = vec_mergeh(rl, gl);
+ bx0 = vec_mergeh(bl, pw_255);
+ rg1 = vec_mergel(rl, gl);
+ bx1 = vec_mergel(bl, pw_255);
+ rg2 = vec_mergeh(rh, gh);
+ bx2 = vec_mergeh(bh, pw_255);
+ rg3 = vec_mergel(rh, gh);
+ bx3 = vec_mergel(bh, pw_255);
+ /* Narrow the 16-bit lanes to bytes with vec_packsu(). */
+ rgbx0 = vec_packsu(rg0, bx0);
+ rgbx1 = vec_packsu(rg1, bx1);
+ rgbx2 = vec_packsu(rg2, bx2);
+ rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+ rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+ rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+ rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+ rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+ rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+ offset = (size_t)outptr & 15;
+ if (offset) {
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset; /* remaining bytes, counted from 16-byte boundary below outptr */
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overwrite. Since there is no way to
+ * write a partial AltiVec register, overwrite would occur on the
+ * last chunk of the last image row if the right edge is not on a
+ * 16-byte boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
+ */
+ vec_st(rgb0, 0, tmpbuf);
+ vec_st(rgb1, 16, tmpbuf);
+ vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ vec_st(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path: presumably keeps the bytes already in memory around the run */
+ unaligned_shift_index = vec_lvsl(0, outptr);
+ edgel = vec_ld(0, outptr); /* existing contents at the start of the run */
+ edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); /* existing contents at its end */
+ edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+ unaligned_shift_index = vec_lvsr(0, outptr);
+ out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+ out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+ out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+ out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+ vec_st(out0, 0, outptr);
+ if (bytes > 16)
+ vec_st(out1, 16, outptr);
+ if (bytes > 32)
+ vec_st(out2, 32, outptr);
+ if (bytes > 48)
+ vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ vec_st(out4, 64, outptr);
+#endif
+ }
+ } else { /* outptr is 16-byte aligned; the code below is also the little-endian path */
+#endif /* __BIG_ENDIAN__ */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path: stage in tmpbuf, copy only the bytes that belong to the row */
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ VEC_ST(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path: VEC_ST is defined outside this file (presumably per-endian store) */
+ VEC_ST(rgb0, 0, outptr);
+ if (num_cols > 16)
+ VEC_ST(rgb1, 16, outptr);
+ if (num_cols > 32)
+ VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ VEC_ST(rgb3, 48, outptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ }
+#endif
+ }
+ }
+}
diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+/* F_0_402, F_0_285 and F_0_228 are the derived factors; each fits in 16 signed bits */
+#define SCALEBITS 16 /* the FIX() values above are scaled by 2^16 */
+#define ONE_HALF (1 << (SCALEBITS - 1)) /* 0.5 in that scale (rounding term) */
+
+#define RGB_INDEX0 /* builds rgb0 from rgbx0, rgbx1 in jdcolext-altivec.c */ \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 /* builds rgb1 from rgbx1, rgbx2 */ \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 /* builds rgb2 from rgbx2, rgbx3 */ \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c" /* no rename active: emits jsimd_ycc_rgb_convert_altivec() */
+#undef RGB_PIXELSIZE /* NOTE(review): not defined in this file; presumably set by a header */
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c" /* EXT_RGB instance; reuses the three tables defined above */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX /* single table, applied to each rgbx vector on its own */ \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c" /* emits jsimd_ycc_extrgbx_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c" /* emits jsimd_ycc_extbgr_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c" /* emits jsimd_ycc_extbgrx_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c" /* emits jsimd_ycc_extxbgr_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c" /* emits jsimd_ycc_extxrgb_convert_altivec() */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16 /* fraction bits: FIX(1) == 1 << SCALEBITS == 65536 */
+#define ONE_HALF (1 << (SCALEBITS - 1)) /* FIX(0.5), used as rounding term */
+
+#define RGB_INDEX0 \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } /* R G B, out bytes 0..15 */
+#define RGB_INDEX1 \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } /* out bytes 16..31 */
+#define RGB_INDEX2 \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } /* out bytes 32..47 */
+#include "jdmrgext-altivec.c" /* 1st pass: no rename macros are set in this file yet */
+#undef RGB_PIXELSIZE /* not defined above in this file; value came from outside */
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE /* reuses the R G B tables defined above */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } /* R G B X per pixel */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } /* B G R, out bytes 0..15 */
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } /* out bytes 16..31 */
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } /* out bytes 32..47 */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } /* B G R X per pixel */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } /* X B G R per pixel */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } /* X R G B per pixel */
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
new file mode 100644
index 0000000000..40f02c33ea
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
@@ -0,0 +1,329 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{ /* Merged h2v1 chroma upsampling + YCbCr->RGB conversion of one output row. */
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; /* pitch = output bytes per row */
+#if __BIG_ENDIAN__
+ int offset;
+#endif
+ unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; /* slow-path staging */
+
+ __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+ y, cb, cr;
+#if __BIG_ENDIAN__
+ __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+ __vector unsigned char rgb3;
+#endif
+ __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+ rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+ __vector int g_y0, g_y1, g_y2, g_y3;
+
+ /* Constants
+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+ * high-order bits, not 16.
+ */
+ __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+ pw_mf0228 = { __8X(-F_0_228 >> 1) },
+ pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, /* (Cb, Cr) weights for G */
+ pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+ pw_cj = { __8X(CENTERJSAMPLE) };
+ __vector int pd_onehalf = { __4X(ONE_HALF) };
+ __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ even_index =
+ { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
+ odd_index =
+ { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
+#else
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+ even_index =
+ { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
+ odd_index =
+ { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
+#endif
+
+ inptr0 = input_buf[0][in_row_group_ctr]; /* Y row */
+ inptr1 = input_buf[1][in_row_group_ctr]; /* Cb row */
+ inptr2 = input_buf[2][in_row_group_ctr]; /* Cr row */
+ outptr = output_buf[0];
+
+ for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { /* 16 Cb/Cr per pass */
+
+ cb = vec_ld(0, inptr1);
+ /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+ * support unsigned vectors.
+ */
+ cbl = (__vector signed short)VEC_UNPACKHU(cb);
+ cbh = (__vector signed short)VEC_UNPACKLU(cb);
+ cbl = vec_sub(cbl, pw_cj); /* center chroma on zero */
+ cbh = vec_sub(cbh, pw_cj);
+
+ cr = vec_ld(0, inptr2);
+ crl = (__vector signed short)VEC_UNPACKHU(cr);
+ crh = (__vector signed short)VEC_UNPACKLU(cr);
+ crl = vec_sub(crl, pw_cj);
+ crh = vec_sub(crh, pw_cj);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
+ b_yl = vec_add(cbl, cbl); /* 2 * Cb, paired with the halved constant */
+ b_yh = vec_add(cbh, cbh);
+ b_yl = vec_madds(b_yl, pw_mf0228, pw_one); /* product + 1 ... */
+ b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+ b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); /* ... then >> 1 */
+ b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+ b_yl = vec_add(b_yl, cbl); /* + Cb */
+ b_yh = vec_add(b_yh, cbh);
+ b_yl = vec_add(b_yl, cbl); /* + Cb again: b_y* now holds B - Y */
+ b_yh = vec_add(b_yh, cbh);
+
+ r_yl = vec_add(crl, crl); /* same scheme as above, for R - Y */
+ r_yh = vec_add(crh, crh);
+ r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+ r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+ r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+ r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+ r_yl = vec_add(r_yl, crl); /* + Cr */
+ r_yh = vec_add(r_yh, crh);
+
+ g_y0w = vec_mergeh(cbl, crl); /* pair up Cb and Cr for the multiply-sum */
+ g_y1w = vec_mergel(cbl, crl);
+ g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+ g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+ g_y2w = vec_mergeh(cbh, crh);
+ g_y3w = vec_mergel(cbh, crh);
+ g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+ g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+ /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
+ * each dword into a new 16-bit vector, which is the equivalent of
+ * descaling the 32-bit results (right-shifting by 16 bits) and then
+ * packing them.
+ */
+ g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+ shift_pack_index);
+ g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+ shift_pack_index);
+ g_yl = vec_sub(g_yl, crl); /* - Cr: g_y* now holds G - Y */
+ g_yh = vec_sub(g_yh, crh);
+
+ for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+ num_cols -= RGB_PIXELSIZE * 16,
+ outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { /* 16 pixels per pass */
+
+ y = vec_ld(0, inptr0);
+ ye = (__vector signed short)vec_perm(pb_zero, y, even_index); /* even-numbered Y, widened */
+ yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); /* odd-numbered Y, widened */
+
+ if (yloop == 0) { /* first 16 pixels use the *l chroma terms */
+ be = vec_add(b_yl, ye);
+ bo = vec_add(b_yl, yo);
+ re = vec_add(r_yl, ye);
+ ro = vec_add(r_yl, yo);
+ ge = vec_add(g_yl, ye);
+ go = vec_add(g_yl, yo);
+ } else { /* next 16 pixels use the *h chroma terms */
+ be = vec_add(b_yh, ye);
+ bo = vec_add(b_yh, yo);
+ re = vec_add(r_yh, ye);
+ ro = vec_add(r_yh, yo);
+ ge = vec_add(g_yh, ye);
+ go = vec_add(g_yh, yo);
+ }
+
+ rl = vec_mergeh(re, ro); /* recombine even- and odd-pixel results */
+ rh = vec_mergel(re, ro);
+ gl = vec_mergeh(ge, go);
+ gh = vec_mergel(ge, go);
+ bl = vec_mergeh(be, bo);
+ bh = vec_mergel(be, bo);
+
+ rg0 = vec_mergeh(rl, gl);
+ bx0 = vec_mergeh(bl, pw_255); /* X (filler) channel is the constant 255 */
+ rg1 = vec_mergel(rl, gl);
+ bx1 = vec_mergel(bl, pw_255);
+ rg2 = vec_mergeh(rh, gh);
+ bx2 = vec_mergeh(bh, pw_255);
+ rg3 = vec_mergel(rh, gh);
+ bx3 = vec_mergel(bh, pw_255);
+
+ rgbx0 = vec_packsu(rg0, bx0); /* narrow to bytes; layout shown below */
+ rgbx1 = vec_packsu(rg1, bx1);
+ rgbx2 = vec_packsu(rg2, bx2);
+ rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+ * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+ * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+ rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+ rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+ /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+ * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+ * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+ * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+ *
+ * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+ * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+ * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+ * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+ */
+ rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+ rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+ rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+ rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+ offset = (size_t)outptr & 15; /* misalignment of outptr within 16 bytes */
+ if (offset) { /* outptr is not 16-byte aligned */
+ __vector unsigned char unaligned_shift_index;
+ int bytes = num_cols + offset; /* remaining row bytes + leading misalignment */
+
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overwrite. Since there is no way to
+ * write a partial AltiVec register, overwrite would occur on the
+ * last chunk of the last image row if the right edge is not on a
+ * 16-byte boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
+ */
+ vec_st(rgb0, 0, tmpbuf);
+ vec_st(rgb1, 16, tmpbuf);
+ vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ vec_st(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); /* copy only bytes that belong to the row */
+ } else {
+ /* Fast path */
+ unaligned_shift_index = vec_lvsl(0, outptr);
+ edgel = vec_ld(0, outptr); /* NOTE(review): presumably existing bytes to keep -- confirm */
+ edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+ edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+ unaligned_shift_index = vec_lvsr(0, outptr);
+ out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+ out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+ out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+ out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+ out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+ vec_st(out0, 0, outptr);
+ if (bytes > 16) /* later vectors are stored only if the row reaches them */
+ vec_st(out1, 16, outptr);
+ if (bytes > 32)
+ vec_st(out2, 32, outptr);
+ if (bytes > 48)
+ vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+ if (bytes > 64)
+ vec_st(out4, 64, outptr);
+#endif
+ }
+ } else { /* outptr is 16-byte aligned */
+#endif /* __BIG_ENDIAN__ */
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { /* tail is not a whole number of vectors */
+ /* Slow path */
+ VEC_ST(rgb0, 0, tmpbuf);
+ VEC_ST(rgb1, 16, tmpbuf);
+ VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+ VEC_ST(rgb3, 48, tmpbuf);
+#endif
+ memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+ } else {
+ /* Fast path */
+ VEC_ST(rgb0, 0, outptr);
+ if (num_cols > 16)
+ VEC_ST(rgb1, 16, outptr);
+ if (num_cols > 32)
+ VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+ if (num_cols > 48)
+ VEC_ST(rgb3, 48, outptr);
+#endif
+ }
+#if __BIG_ENDIAN__
+ }
+#endif
+ }
+ }
+}
+
+
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
+{
+  /* Runs the h2v1 routine twice, once per luma row of the row group, by
+   * temporarily redirecting the row pointers it reads; both are restored.
+   */
+  JSAMPARRAY luma_rows = input_buf[0];
+  JSAMPROW saved_luma = luma_rows[in_row_group_ctr];
+  JSAMPROW saved_out = output_buf[0];
+
+  luma_rows[in_row_group_ctr] = luma_rows[in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+  luma_rows[in_row_group_ctr] = luma_rows[in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+  luma_rows[in_row_group_ctr] = saved_luma;
+  output_buf[0] = saved_out;
+}
diff --git a/media/libjpeg/simd/powerpc/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c
new file mode 100644
index 0000000000..04df0cf108
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c
@@ -0,0 +1,400 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{ /* Fancy h2v1 upsampling: out = (3 * sample + neighbour + bias) >> 2. */
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow, incol;
+
+ __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
+ out;
+ __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+ next0l, next0h, outle, outhe, outlo, outho;
+
+ /* Constants: vec_perm selectors for the neighbour vectors, plus biases */
+ __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+ last_index_col0 =
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ last_index =
+ { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+ next_index =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
+ next_index_lastcol =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
+#if __BIG_ENDIAN__
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+ __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+
+ if (downsampled_width & 15) /* width is not a multiple of 16 */
+ inptr[downsampled_width] = inptr[downsampled_width - 1]; /* replicate last sample one past the end */
+
+ this0 = vec_ld(0, inptr);
+ p_last0 = vec_perm(this0, this0, last_index_col0); /* column 0 uses itself as left neighbour */
+ last0 = this0;
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr += 16, outptr += 32) { /* 16 inputs -> 32 outputs */
+
+ if (downsampled_width - incol > 0) { /* not the first vector of the row */
+ p_last0 = vec_perm(last0, this0, last_index);
+ last0 = this0;
+ }
+
+ if (incol <= 16) /* last vector: no following vector to borrow from */
+ p_next0 = vec_perm(this0, this0, next_index_lastcol);
+ else {
+ next0 = vec_ld(16, inptr);
+ p_next0 = vec_perm(this0, next0, next_index);
+ }
+
+ this0e = (__vector short)vec_mule(this0, pb_three); /* 3 * sample, widened to 16 bits */
+ this0o = (__vector short)vec_mulo(this0, pb_three);
+ this0l = vec_mergeh(this0e, this0o);
+ this0h = vec_mergel(this0e, this0o);
+
+ last0l = (__vector short)VEC_UNPACKHU(p_last0);
+ last0h = (__vector short)VEC_UNPACKLU(p_last0);
+ last0l = vec_add(last0l, pw_one); /* bias 1 for the "last" (outle) outputs */
+
+ next0l = (__vector short)VEC_UNPACKHU(p_next0);
+ next0h = (__vector short)VEC_UNPACKLU(p_next0);
+ next0l = vec_add(next0l, pw_two); /* bias 2 for the "next" (outlo) outputs */
+
+ outle = vec_add(this0l, last0l);
+ outlo = vec_add(this0l, next0l);
+ outle = vec_sr(outle, (__vector unsigned short)pw_two); /* >> 2 */
+ outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index); /* interleave outle/outlo bytes */
+ vec_st(out, 0, outptr);
+
+ if (incol > 8) { /* more than 8 input samples remain: emit the second half */
+ last0h = vec_add(last0h, pw_one);
+ next0h = vec_add(next0h, pw_two);
+
+ outhe = vec_add(this0h, last0h);
+ outho = vec_add(this0h, next0h);
+ outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+ outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr);
+ }
+
+ this0 = next0; /* slide the window to the next input vector */
+ }
+ }
+}
+
+
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{ /* Fancy h2v2 upsampling: two output rows per input row, 32 outputs per 16 inputs. */
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol;
+
+ __vector unsigned char this_1, this0, this1, out;
+ __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+ lastcolsum_1h, lastcolsum1h,
+ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+ nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+ nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
+ p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+ tmpl, tmph, outle, outhe, outlo, outho;
+
+ /* Constants: byte selectors that shift 16-bit column sums by one element */
+ __vector unsigned char pb_zero = { __16X(0) },
+ last_index_col0 =
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ last_index =
+ { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ next_index =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
+ next_index_lastcol =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
+#if __BIG_ENDIAN__
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+ __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+ pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+ __vector unsigned short pw_four = { __8X(4) };
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { /* 1 input row -> 2 output rows */
+
+ inptr_1 = input_data[inrow - 1]; /* previous row; index -1 on the first pass */
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1]; /* following row */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ if (downsampled_width & 15) { /* width is not a multiple of 16 */
+ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; /* replicate last sample one past the end */
+ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+ }
+
+ this0 = vec_ld(0, inptr0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
+ this0l = vec_mladd(this0l, pw_three, pw_zero); /* 3 * current row */
+ this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+ this_1 = vec_ld(0, inptr_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
+ thiscolsum_1l = vec_add(this0l, this_1l); /* 3 * current + previous row */
+ thiscolsum_1h = vec_add(this0h, this_1h);
+ lastcolsum_1h = thiscolsum_1h;
+ p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); /* column 0 uses itself */
+ p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+ this1 = vec_ld(0, inptr1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
+ thiscolsum1l = vec_add(this0l, this1l); /* 3 * current + following row */
+ thiscolsum1h = vec_add(this0h, this1h);
+ lastcolsum1h = thiscolsum1h;
+ p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+ p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+ outptr0 += 32, outptr1 += 32) { /* 16 inputs -> 32 outputs per row */
+
+ if (downsampled_width - incol > 0) { /* not the first vector of the row */
+ p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+ p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+ p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+ p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+ lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
+ }
+
+ if (incol <= 16) { /* last vector: no following vector to borrow from */
+ p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+ p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+ next_index_lastcol);
+ p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+ p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+ next_index_lastcol);
+ } else { /* compute the next vector's column sums now */
+ this0 = vec_ld(16, inptr0);
+ this0l = (__vector short)VEC_UNPACKHU(this0);
+ this0h = (__vector short)VEC_UNPACKLU(this0);
+ this0l = vec_mladd(this0l, pw_three, pw_zero);
+ this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+ this_1 = vec_ld(16, inptr_1);
+ this_1l = (__vector short)VEC_UNPACKHU(this_1);
+ this_1h = (__vector short)VEC_UNPACKLU(this_1);
+ nextcolsum_1l = vec_add(this0l, this_1l);
+ nextcolsum_1h = vec_add(this0h, this_1h);
+ p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+ p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+ this1 = vec_ld(16, inptr1);
+ this1l = (__vector short)VEC_UNPACKHU(this1);
+ this1h = (__vector short)VEC_UNPACKLU(this1);
+ nextcolsum1l = vec_add(this0l, this1l);
+ nextcolsum1h = vec_add(this0h, this1h);
+ p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+ p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+ }
+
+ /* Process the upper row: out = (3 * colsum + neighbour colsum + 8 or 7) >> 4 */
+
+ tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); /* 3 * column sum */
+ outle = vec_add(tmpl, p_lastcolsum_1l);
+ outle = vec_add(outle, pw_eight); /* bias 8 with the "last" neighbour */
+ outle = vec_sr(outle, pw_four); /* >> 4 */
+
+ outlo = vec_add(tmpl, p_nextcolsum_1l);
+ outlo = vec_add(outlo, pw_seven); /* bias 7 with the "next" neighbour */
+ outlo = vec_sr(outlo, pw_four);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index); /* interleave outle/outlo bytes */
+ vec_st(out, 0, outptr0);
+
+ if (incol > 8) { /* more than 8 input samples remain: emit the second half */
+ tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum_1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum_1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr0);
+ }
+
+ /* Process the lower row: same arithmetic, using the following-row sums */
+
+ tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+ outle = vec_add(tmpl, p_lastcolsum1l);
+ outle = vec_add(outle, pw_eight);
+ outle = vec_sr(outle, pw_four);
+
+ outlo = vec_add(tmpl, p_nextcolsum1l);
+ outlo = vec_add(outlo, pw_seven);
+ outlo = vec_sr(outlo, pw_four);
+
+ out = vec_perm((__vector unsigned char)outle,
+ (__vector unsigned char)outlo, merge_pack_index);
+ vec_st(out, 0, outptr1);
+
+ if (incol > 8) {
+ tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr1);
+ }
+
+ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; /* slide the window */
+ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
+ }
+ }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  /* Plain 2:1 horizontal replication.  Each pass of the inner loop reads up
+   * to 32 input bytes and writes up to 64 output bytes, merging every input
+   * vector with itself so that each sample appears twice in a row.
+   */
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  int row, cols_left;
+
+  for (row = 0; row < max_v_samp_factor; row++) {
+    JSAMPROW src = input_data[row];
+    JSAMPROW dst = dst_rows[row];
+
+    /* Column budget is the output width rounded up to a multiple of 32. */
+    cols_left = (output_width + 31) & (~31);
+    while (cols_left > 0) {
+      __vector unsigned char pix = vec_ld(0, src);
+
+      /* First 16 input bytes become output bytes 0..31. */
+      vec_st(vec_mergeh(pix, pix), 0, dst);
+      vec_st(vec_mergel(pix, pix), 16, dst);
+
+      if (cols_left > 32) {
+        /* Second 16 input bytes become output bytes 32..63. */
+        pix = vec_ld(16, src);
+        vec_st(vec_mergeh(pix, pix), 32, dst);
+        vec_st(vec_mergel(pix, pix), 48, dst);
+      }
+
+      cols_left -= 64;  src += 32;  dst += 64;
+    }
+  }
+}
+
+
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  /* Plain 2:1 replication in both directions: each input row produces two
+   * output rows, and both receive the same sample-doubled vectors.
+   */
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  int src_row = 0, dst_row = 0, cols_left;
+  __vector unsigned char pix, dup_lo, dup_hi;
+
+  while (dst_row < max_v_samp_factor) {
+    JSAMPROW src = input_data[src_row];
+    JSAMPROW dst_a = dst_rows[dst_row];
+    JSAMPROW dst_b = dst_rows[dst_row + 1];
+
+    /* Column budget is the output width rounded up to a multiple of 32. */
+    cols_left = (output_width + 31) & (~31);
+    while (cols_left > 0) {
+      pix = vec_ld(0, src);
+      dup_lo = vec_mergeh(pix, pix);
+      dup_hi = vec_mergel(pix, pix);
+      vec_st(dup_lo, 0, dst_a);
+      vec_st(dup_lo, 0, dst_b);
+      vec_st(dup_hi, 16, dst_a);
+      vec_st(dup_hi, 16, dst_b);
+
+      if (cols_left > 32) {
+        /* Second 16 input bytes become output bytes 32..63 of both rows. */
+        pix = vec_ld(16, src);
+        dup_lo = vec_mergeh(pix, pix);
+        dup_hi = vec_mergel(pix, pix);
+        vec_st(dup_lo, 32, dst_a);
+        vec_st(dup_lo, 32, dst_b);
+        vec_st(dup_hi, 48, dst_a);
+        vec_st(dup_hi, 48, dst_b);
+      }
+
+      cols_left -= 64;  src += 32;  dst_a += 64;  dst_b += 64;
+    }
+    src_row++;  dst_row += 2;
+  }
+}
diff --git a/media/libjpeg/simd/powerpc/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
new file mode 100644
index 0000000000..ad9af81e0c
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
@@ -0,0 +1,154 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
+
+#define CONST_BITS 8 /* FIX(x) above is x scaled by 1 << CONST_BITS (256) */
+#define PRE_MULTIPLY_SCALE_BITS 2 /* left shift applied to inputs before vec_madds() */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) /* = 5 */
+
+
+#define DO_FDCT() { /* one 1-D pass: reads tmp0..tmp7, writes out0..out7 */ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out4 = vec_sub(tmp10, tmp11); \
+ \
+ z1 = vec_add(tmp12, tmp13); \
+ z1 = vec_sl(z1, pre_multiply_scale_bits); /* pre-scale so the high-bits product keeps precision */ \
+ z1 = vec_madds(z1, pw_0707, pw_zero); /* (tmp12 + tmp13) * 0.707 */ \
+ \
+ out2 = vec_add(tmp13, z1); \
+ out6 = vec_sub(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = vec_add(tmp4, tmp5); /* tmp10..tmp12 are reused as scratch here */ \
+ tmp11 = vec_add(tmp5, tmp6); \
+ tmp12 = vec_add(tmp6, tmp7); \
+ \
+ tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ z5 = vec_sub(tmp10, tmp12); \
+ z5 = vec_madds(z5, pw_0382, pw_zero); /* (tmp10 - tmp12) * 0.382 */ \
+ \
+ z2 = vec_madds(tmp10, pw_0541, z5); /* tmp10 * 0.541 + z5 */ \
+ z4 = vec_madds(tmp12, pw_1306, z5); /* tmp12 * 1.306 + z5 */ \
+ \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ z3 = vec_madds(tmp11, pw_0707, pw_zero); /* tmp11 * 0.707 */ \
+ \
+ z11 = vec_add(tmp7, z3); \
+ z13 = vec_sub(tmp7, z3); \
+ \
+ out5 = vec_add(z13, z2); \
+ out3 = vec_sub(z13, z2); \
+ out1 = vec_add(z11, z4); \
+ out7 = vec_sub(z11, z4); \
+}
+
+
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
+{ /* Fast integer forward DCT, in place, on the eight 16-byte vectors in data. */
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z1, z2, z3, z4, z5, z11, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants: FIX() values pre-shifted by CONST_SHIFT for vec_madds() */
+ __vector short pw_zero = { __8X(0) },
+ pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+ pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+ pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+ pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+ /* Pass 1: process rows */
+
+ row0 = vec_ld(0, data); /* eight vectors at a 16-byte stride */
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col); /* macro defined elsewhere; presumably row0..7 -> col0..7 -- confirm */
+
+ tmp0 = vec_add(col0, col7); /* sums and differences of mirrored pairs */
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT(); /* tmp0..tmp7 -> out0..out7 */
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row); /* pass-1 results are fed back in as row0..7 */
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT();
+
+ vec_st(out0, 0, data); /* results overwrite the input block */
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+/* Fixed-point multipliers: each value is the quoted constant multiplied by
+ * 2^CONST_BITS and rounded to the nearest integer
+ * (e.g. 0.298631336 * 8192 ~= 2446).
+ */
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+/* CONST_BITS: fractional bits in the F_* multipliers above.
+ * PASS1_BITS: extra scaling that pass 1 leaves in its output.
+ * DESCALE_P1 (11): right shift after pass 1 products; removes CONST_BITS but
+ *                  keeps the result scaled up by 2^PASS1_BITS.
+ * DESCALE_P2 (15): right shift after pass 2 products; removes both.
+ */
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+/*
+ * Shared tail of both FDCT passes: computes out2/out6 (rest of the even part)
+ * and out1/out3/out5/out7 (odd part).
+ *
+ * PASS (1 or 2) is token-pasted to select pd_descale_p<PASS> (the rounding
+ * term) and descale_p<PASS> (the right-shift count).
+ *
+ * Expects tmp4..tmp7, tmp12 and tmp13 to have been set by the caller, and the
+ * pw_*, pd_* and descale_* constants plus every scratch vector named here to
+ * be declared in the enclosing function.
+ *
+ * Pattern used throughout: vec_mergeh()/vec_mergel() interleave two 16-bit
+ * vectors so that vec_msums() can multiply each pair by a pair of constants
+ * and add both products, plus its third argument, into one 32-bit lane.  The
+ * 32-bit results are then shifted right and packed back to 16 bits.
+ */
+#define DO_FDCT_COMMON(PASS) { \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * data2 = z1 + tmp13 * 0.765366865; \
+ * data6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ \
+ /* The rounding term is supplied as the vec_msums() accumulator. */ \
+ out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+ out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+ out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+ out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ /* z3l..z4h already include the rounding term, so the four outputs that \
+ * accumulate onto them below need no separate rounding add. \
+ */ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \
+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ \
+ out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+ out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+ out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+ out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+ \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ \
+ out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+ out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+ out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+ out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+ \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+/*
+ * First FDCT pass.  Takes tmp0..tmp3 (plus tmp4..tmp7 for the common part)
+ * from the enclosing function and produces out0..out7.
+ *
+ * out0 and out4 involve no constant multiply, so they are scaled up by
+ * shifting left by pass1_bits; the remaining outputs are produced by
+ * DO_FDCT_COMMON(1), which descales by descale_p1.
+ */
+#define DO_FDCT_PASS1() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+/*
+ * Second FDCT pass.  Same structure as DO_FDCT_PASS1(), except that out0 and
+ * out4 are scaled back down: the rounding term pw_descale_p2x is added and the
+ * sum is shifted right (arithmetically) by pass1_bits.  The remaining outputs
+ * come from DO_FDCT_COMMON(2), which descales by descale_p2.
+ */
+#define DO_FDCT_PASS2() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, pw_descale_p2x); \
+ out0 = vec_sra(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, pw_descale_p2x); \
+ out4 = vec_sra(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+/*
+ * Accurate integer forward DCT of one block, computed in place.
+ *
+ * data: read as eight 16-byte vectors at byte offsets 0, 16, ..., 112 and
+ *       overwritten with the result at the same offsets.
+ *       NOTE(review): vec_ld()/vec_st() ignore the low four address bits, so
+ *       data presumably has to be 16-byte aligned -- confirm against callers.
+ *
+ * The DO_FDCT_PASS*() macros and TRANSPOSE() refer to the locals below by
+ * name, so these identifiers must not be renamed.
+ */
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants */
+ /* Each pw_<a>_<b> vector holds the multiplier pair (a, b) repeated four
+  * times, matching the interleaved operands fed to vec_msums().  An "m"
+  * prefix in the name marks a negative value.
+  */
+ __vector short
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+ pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ /* Rounding terms (half of the divisor) and shift counts for each pass. */
+ __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) };
+
+ /* Pass 1: process rows */
+
+ row0 = vec_ld(0, data);
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col);
+
+ /* Butterfly stage: sums go to tmp0..tmp3, differences to tmp7..tmp4. */
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_PASS1();
+
+ /* Pass 2: process columns */
+
+ /* The out0..7 vectors of pass 1 are transposed back into row0..7. */
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_PASS2();
+
+ /* Write the result back over the input block. */
+ vec_st(out0, 0, data);
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
diff --git a/media/libjpeg/simd/powerpc/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
new file mode 100644
index 0000000000..456c6c6174
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
@@ -0,0 +1,255 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+/* Fixed-point multipliers: each value is the quoted constant multiplied by
+ * 2^CONST_BITS and rounded to the nearest integer
+ * (e.g. 1.082392200 * 256 ~= 277).
+ */
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+
+/* CONST_BITS: fractional bits in the F_* multipliers above.
+ * PRE_MULTIPLY_SCALE_BITS: left shift applied to an operand before it is
+ *   passed to vec_madds().
+ * CONST_SHIFT (5): left shift applied to the F_* values when the constant
+ *   vectors are built.  Constant (13 fractional bits) times pre-shifted
+ *   operand (2 bits) gives 15 bits, which vec_madds() discards again.
+ */
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+/*
+ * One 1-D pass of the fast inverse DCT.
+ *
+ * in: identifier prefix (col or row); the macro reads in##0 .. in##7 and
+ *     writes out0 .. out7.
+ *
+ * Every scratch vector (tmp*, z*) and every constant (pw_*,
+ * pre_multiply_scale_bits) must be declared in the enclosing function.
+ *
+ * Multiplications use vec_madds(): the operand is first shifted left by
+ * pre_multiply_scale_bits and the constant was built with CONST_SHIFT, so the
+ * product comes back at the operand's original scale.  pw_zero is passed as
+ * the addend when nothing needs to be accumulated.
+ */
+#define DO_IDCT(in) { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(in##0, in##4); \
+ tmp11 = vec_sub(in##0, in##4); \
+ tmp13 = vec_add(in##2, in##6); \
+ \
+ /* tmp12 = (in2 - in6) * 1.414213562 - tmp13 */ \
+ tmp12 = vec_sub(in##2, in##6); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+ tmp12 = vec_sub(tmp12, tmp13); \
+ \
+ tmp0 = vec_add(tmp10, tmp13); \
+ tmp3 = vec_sub(tmp10, tmp13); \
+ tmp1 = vec_add(tmp11, tmp12); \
+ tmp2 = vec_sub(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ /* z10s and z12s are the pre-shifted forms used as multiply operands; the \
+ * unshifted z10 is kept because it is subtracted directly further down. \
+ */ \
+ z13 = vec_add(in##5, in##3); \
+ z10 = vec_sub(in##5, in##3); \
+ z10s = vec_sl(z10, pre_multiply_scale_bits); \
+ z11 = vec_add(in##1, in##7); \
+ z12s = vec_sub(in##1, in##7); \
+ z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+ \
+ /* tmp11 = (z11 - z13) * 1.414213562 */ \
+ tmp11 = vec_sub(z11, z13); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+ \
+ tmp7 = vec_add(z11, z13); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+ * = -1.613125930 * z10 - z10 + z5; \
+ */ \
+ \
+ z5 = vec_add(z10s, z12s); \
+ z5 = vec_madds(z5, pw_F1847, pw_zero); \
+ \
+ tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+ tmp10 = vec_sub(tmp10, z5); \
+ /* z5 is folded in as the vec_madds() addend; z10 is subtracted after. */ \
+ tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+ tmp12 = vec_sub(tmp12, z10); \
+ \
+ tmp6 = vec_sub(tmp12, tmp7); \
+ tmp5 = vec_sub(tmp11, tmp6); \
+ tmp4 = vec_add(tmp10, tmp5); \
+ \
+ /* Final butterflies combining the even and odd parts. */ \
+ out0 = vec_add(tmp0, tmp7); \
+ out1 = vec_add(tmp1, tmp6); \
+ out2 = vec_add(tmp2, tmp5); \
+ out3 = vec_sub(tmp3, tmp4); \
+ out4 = vec_add(tmp3, tmp4); \
+ out5 = vec_sub(tmp2, tmp5); \
+ out6 = vec_sub(tmp1, tmp6); \
+ out7 = vec_sub(tmp0, tmp7); \
+}
+
+
+/*
+ * Fast integer inverse DCT of one coefficient block, including
+ * dequantization and conversion to output samples.
+ *
+ * dct_table_:  multiplier table, read as eight vectors of 16-bit values at
+ *              byte offsets 0, 16, ..., 112.
+ * coef_block:  input coefficients, read as eight vectors at the same offsets.
+ * output_buf:  output row pointers; rows output_buf[0] .. output_buf[7] are
+ *              written.
+ * output_col:  offset added to each row pointer; 8 bytes are written per row
+ *              starting there.
+ *
+ * NOTE(review): vec_ld() ignores the low four address bits and vec_ste()
+ * the low two, so the table and coefficient block presumably must be 16-byte
+ * aligned and the output address 4-byte aligned -- confirm against callers.
+ *
+ * DO_IDCT() and TRANSPOSE() refer to the locals below by name, so these
+ * identifiers must not be renamed.
+ */
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+ int *outptr;
+
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z5, z10, z10s, z11, z12s, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector signed char outb;
+
+ /* Constants */
+ /* The negation in pw_MF1613 is applied after the shift: left-shifting a
+  * negative signed value is undefined behaviour in ISO C.  The resulting
+  * value is the same as the one the previous form produced.
+  */
+ __vector short pw_zero = { __8X(0) },
+ pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+ pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+ pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },
+ pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+ pass1_bits3 = { __8X(PASS1_BITS + 3) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns */
+
+ col0 = vec_ld(0, coef_block);
+ col1 = vec_ld(16, coef_block);
+ col2 = vec_ld(32, coef_block);
+ col3 = vec_ld(48, coef_block);
+ col4 = vec_ld(64, coef_block);
+ col5 = vec_ld(80, coef_block);
+ col6 = vec_ld(96, coef_block);
+ col7 = vec_ld(112, coef_block);
+
+ /* OR together every vector except the first; the result is all zero only
+  * if all of col1..col7 are zero.
+  */
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+
+ /* Dequantize the first vector; it is needed on both paths below. */
+ quant0 = vec_ld(0, dct_table);
+ col0 = vec_mladd(col0, quant0, pw_zero);
+
+ if (vec_all_eq(tmp1, pw_zero)) {
+ /* AC terms all zero */
+
+ /* Shortcut: pass 1 and the transpose are skipped; rowN is element N of
+  * the dequantized col0 replicated across the whole vector.
+  */
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+
+ quant1 = vec_ld(16, dct_table);
+ quant2 = vec_ld(32, dct_table);
+ quant3 = vec_ld(48, dct_table);
+ quant4 = vec_ld(64, dct_table);
+ quant5 = vec_ld(80, dct_table);
+ quant6 = vec_ld(96, dct_table);
+ quant7 = vec_ld(112, dct_table);
+
+ /* Dequantize: element-wise multiply (low 16 bits), nothing added. */
+ col1 = vec_mladd(col1, quant1, pw_zero);
+ col2 = vec_mladd(col2, quant2, pw_zero);
+ col3 = vec_mladd(col3, quant3, pw_zero);
+ col4 = vec_mladd(col4, quant4, pw_zero);
+ col5 = vec_mladd(col5, quant5, pw_zero);
+ col6 = vec_mladd(col6, quant6, pw_zero);
+ col7 = vec_mladd(col7, quant7, pw_zero);
+
+ DO_IDCT(col);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(row);
+
+ /* Final descale: arithmetic right shift by PASS1_BITS + 3. */
+ out0 = vec_sra(out0, pass1_bits3);
+ out1 = vec_sra(out1, pass1_bits3);
+ out2 = vec_sra(out2, pass1_bits3);
+ out3 = vec_sra(out3, pass1_bits3);
+ out4 = vec_sra(out4, pass1_bits3);
+ out5 = vec_sra(out5, pass1_bits3);
+ out6 = vec_sra(out6, pass1_bits3);
+ out7 = vec_sra(out7, pass1_bits3);
+
+ TRANSPOSE(out, col);
+
+ /* Output stage, repeated for each of the eight rows: saturate the 16-bit
+  * values to signed bytes (packing colN with itself puts the same eight
+  * samples in both halves of outb), add CENTERJSAMPLE, then store eight
+  * bytes with two 4-byte element stores at offsets 0 and 4.
+  */
+ outb = vec_packs(col0, col0);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[0] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col1, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[1] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col2, col2);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[2] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col3, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[3] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col4, col4);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[4] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col5, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[5] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col6, col6);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[6] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col7, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[7] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/media/libjpeg/simd/powerpc/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c
new file mode 100644
index 0000000000..60e619f11d
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c
@@ -0,0 +1,357 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+/* Fixed-point multipliers: each value is the quoted constant multiplied by
+ * 2^CONST_BITS and rounded to the nearest integer
+ * (e.g. 0.298631336 * 8192 ~= 2446).
+ */
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+/* CONST_BITS: fractional bits in the F_* multipliers above.
+ * PASS1_BITS: extra scaling that pass 1 leaves in its output.
+ * DESCALE_P1 (11): right shift at the end of pass 1; removes CONST_BITS but
+ *                  keeps the result scaled up by 2^PASS1_BITS.
+ * DESCALE_P2 (18): right shift at the end of pass 2; removes CONST_BITS and
+ *                  PASS1_BITS plus 3 more bits.
+ *                  NOTE(review): the extra 3 bits presumably account for the
+ *                  1/8 factor of the 2-D inverse transform -- confirm.
+ */
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+
+/*
+ * One 1-D pass of the accurate inverse DCT.
+ *
+ * in:   identifier prefix (col or row); the macro reads in##0 .. in##7 and
+ *       writes out0 .. out7.  It also token-pastes the scratch names
+ *       in##26l/h, in##71l/h and in##53l/h, so those must be declared for
+ *       each prefix used.
+ * PASS: 1 or 2; selects pd_descale_p<PASS> (rounding term) and
+ *       descale_p<PASS> (right-shift count).
+ *
+ * All arithmetic on products is done in 32-bit lanes.  Variables ending in
+ * "l" come from vec_mergeh()/vec_unpackh() (first four elements of the
+ * 16-bit source), those ending in "h" from vec_mergel()/vec_unpackl() (last
+ * four).  vec_pack(xl, xh) recombines them into one 16-bit vector.
+ */
+#define DO_IDCT(in, PASS) { \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ in##26l = vec_mergeh(in##2, in##6); \
+ in##26h = vec_mergel(in##2, in##6); \
+ \
+ tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+ tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+ tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+ tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+ \
+ tmp0 = vec_add(in##0, in##4); \
+ tmp1 = vec_sub(in##0, in##4); \
+ \
+ /* Widen tmp0 to 32 bits, scale it up by CONST_BITS to match the \
+ * products, and add the rounding term here once; it is carried into \
+ * every output through tmp10..tmp13. \
+ */ \
+ tmp0l = vec_unpackh(tmp0); \
+ tmp0h = vec_unpackl(tmp0); \
+ tmp0l = vec_sl(tmp0l, const_bits); \
+ tmp0h = vec_sl(tmp0h, const_bits); \
+ tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+ tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+ \
+ tmp10l = vec_add(tmp0l, tmp3l); \
+ tmp10h = vec_add(tmp0h, tmp3h); \
+ tmp13l = vec_sub(tmp0l, tmp3l); \
+ tmp13h = vec_sub(tmp0h, tmp3h); \
+ \
+ /* Same widening, scaling and rounding for tmp1. */ \
+ tmp1l = vec_unpackh(tmp1); \
+ tmp1h = vec_unpackl(tmp1); \
+ tmp1l = vec_sl(tmp1l, const_bits); \
+ tmp1h = vec_sl(tmp1h, const_bits); \
+ tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+ tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+ \
+ tmp11l = vec_add(tmp1l, tmp2l); \
+ tmp11h = vec_add(tmp1h, tmp2h); \
+ tmp12l = vec_sub(tmp1l, tmp2l); \
+ tmp12h = vec_sub(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(in##3, in##7); \
+ z4 = vec_add(in##1, in##5); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ \
+ /* From here on tmp0l..tmp3h are reused for the odd part; the even-part \
+ * values they held have already been folded into tmp10..tmp13.  The \
+ * z3/z4 terms are added via the vec_msums() accumulator argument. \
+ */ \
+ in##71l = vec_mergeh(in##7, in##1); \
+ in##71h = vec_mergel(in##7, in##1); \
+ \
+ tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+ tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+ tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+ tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+ \
+ in##53l = vec_mergeh(in##5, in##3); \
+ in##53h = vec_mergel(in##5, in##3); \
+ \
+ tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+ tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+ tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+ tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+ \
+ /* Final output stage */ \
+ \
+ /* Each output pair is even +/- odd, descaled and packed to 16 bits. */ \
+ out0l = vec_add(tmp10l, tmp3l); \
+ out0h = vec_add(tmp10h, tmp3h); \
+ out7l = vec_sub(tmp10l, tmp3l); \
+ out7h = vec_sub(tmp10h, tmp3h); \
+ \
+ out0l = vec_sra(out0l, descale_p##PASS); \
+ out0h = vec_sra(out0h, descale_p##PASS); \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ \
+ out0 = vec_pack(out0l, out0h); \
+ out7 = vec_pack(out7l, out7h); \
+ \
+ out1l = vec_add(tmp11l, tmp2l); \
+ out1h = vec_add(tmp11h, tmp2h); \
+ out6l = vec_sub(tmp11l, tmp2l); \
+ out6h = vec_sub(tmp11h, tmp2h); \
+ \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out1 = vec_pack(out1l, out1h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ out2l = vec_add(tmp12l, tmp1l); \
+ out2h = vec_add(tmp12h, tmp1h); \
+ out5l = vec_sub(tmp12l, tmp1l); \
+ out5h = vec_sub(tmp12h, tmp1h); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out5 = vec_pack(out5l, out5h); \
+ \
+ out3l = vec_add(tmp13l, tmp0l); \
+ out3h = vec_add(tmp13h, tmp0h); \
+ out4l = vec_sub(tmp13l, tmp0l); \
+ out4h = vec_sub(tmp13h, tmp0h); \
+ \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ out4l = vec_sra(out4l, descale_p##PASS); \
+ out4h = vec_sra(out4h, descale_p##PASS); \
+ \
+ out3 = vec_pack(out3l, out3h); \
+ out4 = vec_pack(out4l, out4h); \
+}
+
+
+/*
+ * Accurate integer inverse DCT of one coefficient block, including
+ * dequantization and conversion to output samples.
+ *
+ * dct_table_:  multiplier table, read as eight vectors of 16-bit values at
+ *              byte offsets 0, 16, ..., 112.
+ * coef_block:  input coefficients, read as eight vectors at the same offsets.
+ * output_buf:  output row pointers; rows output_buf[0] .. output_buf[7] are
+ *              written.
+ * output_col:  offset added to each row pointer; 8 bytes are written per row
+ *              starting there.
+ *
+ * NOTE(review): vec_ld() ignores the low four address bits and vec_ste()
+ * the low two, so the table and coefficient block presumably must be 16-byte
+ * aligned and the output address 4-byte aligned -- confirm against callers.
+ *
+ * DO_IDCT() and TRANSPOSE() refer to the locals below by name (including the
+ * token-pasted col26l, row71h, ... scratch vectors), so these identifiers
+ * must not be renamed.
+ */
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+ int *outptr;
+
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, z3, z4,
+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+ row71l, row71h, row26l, row26h, row53l, row53h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+ z3l, z3h, z4l, z4h,
+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+ out5l, out5h, out6l, out6h, out7l, out7h;
+ __vector signed char outb;
+
+ /* Constants */
+ /* Each pw_<a>_<b> vector holds the multiplier pair (a, b) repeated four
+  * times, matching the interleaved operands fed to vec_msums().  An "m"
+  * prefix in the name marks a negative value.
+  */
+ __vector short pw_zero = { __8X(0) },
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ /* Rounding terms (half of the divisor) and shift counts for each pass. */
+ __vector int pd_zero = { __4X(0) },
+ pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) },
+ const_bits = { __4X(CONST_BITS) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns */
+
+ col0 = vec_ld(0, coef_block);
+ col1 = vec_ld(16, coef_block);
+ col2 = vec_ld(32, coef_block);
+ col3 = vec_ld(48, coef_block);
+ col4 = vec_ld(64, coef_block);
+ col5 = vec_ld(80, coef_block);
+ col6 = vec_ld(96, coef_block);
+ col7 = vec_ld(112, coef_block);
+
+ /* OR together every vector except the first; the result is all zero only
+  * if all of col1..col7 are zero.
+  */
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+
+ /* Dequantize the first vector; it is needed on both paths below. */
+ quant0 = vec_ld(0, dct_table);
+ col0 = vec_mladd(col0, quant0, pw_zero);
+
+ if (vec_all_eq(tmp1, pw_zero)) {
+ /* AC terms all zero */
+
+ /* Shortcut: pass 1 and the transpose are skipped.  col0 is scaled up by
+  * PASS1_BITS, the scaling a full pass 1 leaves in its output, and rowN
+  * is element N of col0 replicated across the whole vector.
+  */
+ col0 = vec_sl(col0, pass1_bits);
+
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+
+ quant1 = vec_ld(16, dct_table);
+ quant2 = vec_ld(32, dct_table);
+ quant3 = vec_ld(48, dct_table);
+ quant4 = vec_ld(64, dct_table);
+ quant5 = vec_ld(80, dct_table);
+ quant6 = vec_ld(96, dct_table);
+ quant7 = vec_ld(112, dct_table);
+
+ /* Dequantize: element-wise multiply (low 16 bits), nothing added. */
+ col1 = vec_mladd(col1, quant1, pw_zero);
+ col2 = vec_mladd(col2, quant2, pw_zero);
+ col3 = vec_mladd(col3, quant3, pw_zero);
+ col4 = vec_mladd(col4, quant4, pw_zero);
+ col5 = vec_mladd(col5, quant5, pw_zero);
+ col6 = vec_mladd(col6, quant6, pw_zero);
+ col7 = vec_mladd(col7, quant7, pw_zero);
+
+ DO_IDCT(col, 1);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(row, 2);
+
+ TRANSPOSE(out, col);
+
+ /* Output stage, repeated for each of the eight rows: saturate the 16-bit
+  * values to signed bytes (packing colN with itself puts the same eight
+  * samples in both halves of outb), add CENTERJSAMPLE, then store eight
+  * bytes with two 4-byte element stores at offsets 0 and 4.
+  */
+ outb = vec_packs(col0, col0);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[0] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col1, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[1] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col2, col2);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[2] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col3, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[3] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col4, col4);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[4] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col5, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[5] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col6, col6);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[6] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+
+ outb = vec_packs(col7, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (int *)(output_buf[7] + output_col);
+ vec_ste((__vector int)outb, 0, outptr);
+ vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/media/libjpeg/simd/powerpc/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c
new file mode 100644
index 0000000000..7d6e32542b
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c
@@ -0,0 +1,250 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* LOAD_ROW(row) points the caller's 'elemptr' at sample_data[row] + start_col
+ * and loads one vector from that address into the caller's in<row> variable.
+ * It relies on 'elemptr', 'sample_data', 'start_col', and in0..in7 being in
+ * scope at the point of use.
+ *
+ * NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result.)
+ *
+ * Big-endian builds use vec_ld() and, when the address is not a multiple of
+ * 16, permute the loaded vector with the vec_lvsl() pattern for that address.
+ * Other builds use vec_vsx_ld() with no permute.
+ */
+#if __BIG_ENDIAN__
+
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_ld(0, elemptr); \
+ if ((size_t)elemptr & 15) \
+ in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
+}
+
+#else
+
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
+
+/*
+ * Load eight sample rows starting at column start_col, widen the samples to
+ * 16-bit elements, subtract CENTERJSAMPLE from each, and store the eight
+ * resulting vectors to workspace at byte offsets 0, 16, ..., 112.
+ */
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ JSAMPROW elemptr;
+
+ __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+ __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+ __vector unsigned char pb_zero = { __16X(0) };
+
+ /* One vector load per row; LOAD_ROW uses elemptr as scratch. */
+ LOAD_ROW(0);
+ LOAD_ROW(1);
+ LOAD_ROW(2);
+ LOAD_ROW(3);
+ LOAD_ROW(4);
+ LOAD_ROW(5);
+ LOAD_ROW(6);
+ LOAD_ROW(7);
+
+ /* VEC_UNPACKHU interleaves the high half of each byte vector with pb_zero,
+ * which is what widens the unsigned samples to 16 bits.
+ */
+ out0 = (__vector short)VEC_UNPACKHU(in0);
+ out1 = (__vector short)VEC_UNPACKHU(in1);
+ out2 = (__vector short)VEC_UNPACKHU(in2);
+ out3 = (__vector short)VEC_UNPACKHU(in3);
+ out4 = (__vector short)VEC_UNPACKHU(in4);
+ out5 = (__vector short)VEC_UNPACKHU(in5);
+ out6 = (__vector short)VEC_UNPACKHU(in6);
+ out7 = (__vector short)VEC_UNPACKHU(in7);
+
+ /* Re-center the samples around zero. */
+ out0 = vec_sub(out0, pw_centerjsamp);
+ out1 = vec_sub(out1, pw_centerjsamp);
+ out2 = vec_sub(out2, pw_centerjsamp);
+ out3 = vec_sub(out3, pw_centerjsamp);
+ out4 = vec_sub(out4, pw_centerjsamp);
+ out5 = vec_sub(out5, pw_centerjsamp);
+ out6 = vec_sub(out6, pw_centerjsamp);
+ out7 = vec_sub(out7, pw_centerjsamp);
+
+ /* vec_st() is used unconditionally here, so workspace is presumably
+ * 16-byte aligned -- NOTE(review): confirm against the caller.
+ */
+ vec_st(out0, 0, workspace);
+ vec_st(out1, 16, workspace);
+ vec_st(out2, 32, workspace);
+ vec_st(out3, 48, workspace);
+ vec_st(out4, 64, workspace);
+ vec_st(out5, 80, workspace);
+ vec_st(out6, 96, workspace);
+ vec_st(out7, 112, workspace);
+}
+
+
+/* Width in bits of one vector element handled by the quantizer. */
+#define WORD_BIT 16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+ We basically need an unsigned equivalent of vec_madds(). */
+
+/* MULTIPLY(vs0, vs1, out): multiply the even and odd 16-bit elements of vs0
+ * and vs1 as unsigned values into the 32-bit vectors 'tmpe' and 'tmpo', then
+ * use 'shift_pack_index' to pick two bytes out of every 32-bit product and
+ * pack them back into a vector of 16-bit elements. 'tmpe', 'tmpo', and
+ * 'shift_pack_index' must be declared by the code that uses the macro.
+ */
+#define MULTIPLY(vs0, vs1, out) { \
+ tmpe = vec_mule((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ tmpo = vec_mulo((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+ (__vector unsigned short)tmpo, \
+ shift_pack_index); \
+}
+
+/*
+ * Quantize one block of DCT coefficients.
+ *
+ * Reads eight vectors from workspace, takes their absolute values, adds the
+ * correction vectors, multiplies by the reciprocal vectors and then by the
+ * scale vectors (both via MULTIPLY), reapplies the original signs, and
+ * stores the eight result vectors to coef_block.
+ *
+ * The three tables are read from divisors at byte offsets 0 (reciprocals),
+ * DCTSIZE2 * 2 (corrections), and DCTSIZE2 * 4 (scales).
+ */
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+ __vector unsigned int tmpe, tmpo;
+
+ /* Constants */
+ __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+ /* Byte-selection pattern used by MULTIPLY: from each 32-bit product it
+ * takes the two bytes at the endian-appropriate end, alternating between
+ * the even-element products (indices 0-15) and the odd-element products
+ * (indices 16-31).
+ */
+#if __BIG_ENDIAN__
+ __vector unsigned char shift_pack_index =
+ { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
+#else
+ __vector unsigned char shift_pack_index =
+ { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
+#endif
+
+ row0 = vec_ld(0, workspace);
+ row1 = vec_ld(16, workspace);
+ row2 = vec_ld(32, workspace);
+ row3 = vec_ld(48, workspace);
+ row4 = vec_ld(64, workspace);
+ row5 = vec_ld(80, workspace);
+ row6 = vec_ld(96, workspace);
+ row7 = vec_ld(112, workspace);
+
+ /* Branch-less absolute value */
+ /* rowNs is the element shifted right arithmetically by WORD_BIT - 1, i.e. a
+ * per-element sign mask; (x ^ s) - s then yields |x|. The masks are kept
+ * so the sign can be put back after the multiplies.
+ */
+ row0s = vec_sra(row0, pw_word_bit_m1);
+ row1s = vec_sra(row1, pw_word_bit_m1);
+ row2s = vec_sra(row2, pw_word_bit_m1);
+ row3s = vec_sra(row3, pw_word_bit_m1);
+ row4s = vec_sra(row4, pw_word_bit_m1);
+ row5s = vec_sra(row5, pw_word_bit_m1);
+ row6s = vec_sra(row6, pw_word_bit_m1);
+ row7s = vec_sra(row7, pw_word_bit_m1);
+ row0 = vec_xor(row0, row0s);
+ row1 = vec_xor(row1, row1s);
+ row2 = vec_xor(row2, row2s);
+ row3 = vec_xor(row3, row3s);
+ row4 = vec_xor(row4, row4s);
+ row5 = vec_xor(row5, row5s);
+ row6 = vec_xor(row6, row6s);
+ row7 = vec_xor(row7, row7s);
+ row0 = vec_sub(row0, row0s);
+ row1 = vec_sub(row1, row1s);
+ row2 = vec_sub(row2, row2s);
+ row3 = vec_sub(row3, row3s);
+ row4 = vec_sub(row4, row4s);
+ row5 = vec_sub(row5, row5s);
+ row6 = vec_sub(row6, row6s);
+ row7 = vec_sub(row7, row7s);
+
+ /* Correction terms, added to the absolute values before multiplying. */
+ corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+ corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+ corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+ corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+ corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+ corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+ corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+ corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+ row0 = vec_add(row0, corr0);
+ row1 = vec_add(row1, corr1);
+ row2 = vec_add(row2, corr2);
+ row3 = vec_add(row3, corr3);
+ row4 = vec_add(row4, corr4);
+ row5 = vec_add(row5, corr5);
+ row6 = vec_add(row6, corr6);
+ row7 = vec_add(row7, corr7);
+
+ /* First multiply: by the reciprocal table at the start of divisors. */
+ recip0 = vec_ld(0, divisors);
+ recip1 = vec_ld(16, divisors);
+ recip2 = vec_ld(32, divisors);
+ recip3 = vec_ld(48, divisors);
+ recip4 = vec_ld(64, divisors);
+ recip5 = vec_ld(80, divisors);
+ recip6 = vec_ld(96, divisors);
+ recip7 = vec_ld(112, divisors);
+
+ MULTIPLY(row0, recip0, row0);
+ MULTIPLY(row1, recip1, row1);
+ MULTIPLY(row2, recip2, row2);
+ MULTIPLY(row3, recip3, row3);
+ MULTIPLY(row4, recip4, row4);
+ MULTIPLY(row5, recip5, row5);
+ MULTIPLY(row6, recip6, row6);
+ MULTIPLY(row7, recip7, row7);
+
+ /* Second multiply: by the scale table. */
+ scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+ scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+ scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+ scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+ scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+ scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+ scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+ scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+ MULTIPLY(row0, scale0, row0);
+ MULTIPLY(row1, scale1, row1);
+ MULTIPLY(row2, scale2, row2);
+ MULTIPLY(row3, scale3, row3);
+ MULTIPLY(row4, scale4, row4);
+ MULTIPLY(row5, scale5, row5);
+ MULTIPLY(row6, scale6, row6);
+ MULTIPLY(row7, scale7, row7);
+
+ /* Reapply the saved signs with the same (x ^ s) - s identity. */
+ row0 = vec_xor(row0, row0s);
+ row1 = vec_xor(row1, row1s);
+ row2 = vec_xor(row2, row2s);
+ row3 = vec_xor(row3, row3s);
+ row4 = vec_xor(row4, row4s);
+ row5 = vec_xor(row5, row5s);
+ row6 = vec_xor(row6, row6s);
+ row7 = vec_xor(row7, row7s);
+ row0 = vec_sub(row0, row0s);
+ row1 = vec_sub(row1, row1s);
+ row2 = vec_sub(row2, row2s);
+ row3 = vec_sub(row3, row3s);
+ row4 = vec_sub(row4, row4s);
+ row5 = vec_sub(row5, row5s);
+ row6 = vec_sub(row6, row6s);
+ row7 = vec_sub(row7, row7s);
+
+ vec_st(row0, 0, coef_block);
+ vec_st(row1, 16, coef_block);
+ vec_st(row2, 32, coef_block);
+ vec_st(row3, 48, coef_block);
+ vec_st(row4, 64, coef_block);
+ vec_st(row5, 80, coef_block);
+ vec_st(row6, 96, coef_block);
+ vec_st(row7, 112, coef_block);
+}
diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..461f603633
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd.c
@@ -0,0 +1,884 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <ctype.h>
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+/* Bit mask of detected SIMD features. The all-ones initial value means "not
+ * probed yet"; init_simd() compares against ~0U before doing any work.
+ */
+static THREAD_LOCAL unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+/* Upper bound on the /proc/cpuinfo line buffer; init_simd() stops doubling
+ * the buffer once it would exceed this many bytes.
+ */
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/*
+ * Report whether 'feature' occurs as a whitespace-delimited word on a
+ * /proc/cpuinfo line.
+ *
+ * Only lines that begin with "cpu" are considered; the search starts after
+ * that prefix and any whitespace following it. A match counts only if it
+ * sits at the start of the searched text or is preceded by whitespace, and
+ * is followed by whitespace or the end of the string, so e.g. "noaltivec"
+ * does not satisfy a query for "altivec".
+ *
+ * Returns 1 if such a word is found, 0 otherwise (including when 'feature'
+ * is the empty string).
+ */
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *start, *p;
+  size_t len;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  buffer += 3;
+  /* isspace() requires a value representable as unsigned char (or EOF) */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+
+  /* Keep the start of the searched text fixed so that the "preceded by
+   * whitespace" test is applied to every candidate, not just the first time
+   * it is seen.
+   */
+  start = buffer;
+  len = strlen(feature);
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  for (p = strstr(start, feature); p != NULL; p = strstr(p + 1, feature)) {
+    if (p > start && !isspace((unsigned char)p[-1]))
+      continue;                 /* glued to the preceding characters */
+    if (p[len] != 0 && !isspace((unsigned char)p[len]))
+      continue;                 /* glued to the following characters */
+    return 1;
+  }
+  return 0;
+}
+
+/*
+ * Scan /proc/cpuinfo line by line using a heap buffer of 'bufsize' bytes and
+ * set JSIMD_ALTIVEC in simd_support for every line on which check_feature()
+ * finds "altivec". simd_support is reset to 0 on entry.
+ *
+ * Returns 0 if the buffer could not be allocated or a line did not fit in
+ * it, so the caller can retry with a larger buffer; returns 1 otherwise,
+ * including when /proc/cpuinfo could not be opened.
+ */
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *line = (char *)malloc(bufsize);
+  FILE *cpuinfo;
+  int complete = 1;
+
+  simd_support = 0;
+
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo != NULL) {
+    while (fgets(line, bufsize, cpuinfo) != NULL) {
+      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        complete = 0;
+        break;
+      }
+      if (check_feature(line, "altivec"))
+        simd_support |= JSIMD_ALTIVEC;
+    }
+    fclose(cpuinfo);
+  }
+
+  free(line);
+  return complete;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * Runs the probe only while simd_support still holds its ~0 sentinel; later
+ * calls return immediately. The probe used depends on the build:
+ * - __ALTIVEC__ defined: AltiVec is assumed present.
+ * - Linux/Android: /proc/cpuinfo is parsed, doubling the line buffer while
+ *   parse_proc_cpuinfo() reports failure and the size stays within
+ *   SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT.
+ * - AmigaOS 4: IExec->GetCPUInfoTags().
+ * - Apple / OpenBSD: sysctl().
+ * - FreeBSD: elf_aux_info(AT_HWCAP).
+ * Unless NO_GETENV is defined, JSIMD_FORCEALTIVEC=1 and JSIMD_FORCENONE=1
+ * then override the detected value, with JSIMD_FORCENONE checked last.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+ uint32 altivec = 0;
+#elif defined(__APPLE__)
+ int mib[2] = { CTL_HW, HW_VECTORUNIT };
+ int altivec;
+ size_t len = sizeof(altivec);
+#elif defined(__OpenBSD__)
+ int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+ int altivec;
+ size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+ unsigned long cpufeatures = 0;
+#endif
+
+ /* Already probed: anything other than the ~0 sentinel is a real result. */
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ALTIVEC__)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ /* Retry with a larger buffer until parsing succeeds or the cap is hit. */
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__amigaos4__)
+ IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+ if (altivec == VECTORTYPE_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__APPLE__) || defined(__OpenBSD__)
+ if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+ /* The return value is not checked; cpufeatures was initialized to 0. */
+ elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+ if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEALTIVEC");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_ALTIVEC;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
+
+/*
+ * Return 1 if the AltiVec RGB->YCbCr converter may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples, a 4-byte JDIMENSION, and
+   * 3- or 4-byte RGB pixels.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec RGB->grayscale converter may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples, a 4-byte JDIMENSION, and
+   * 3- or 4-byte RGB pixels.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec YCbCr->RGB converter may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples, a 4-byte JDIMENSION, and
+   * 3- or 4-byte RGB pixels.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Always reports that SIMD YCbCr->RGB565 conversion is unavailable. */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+/*
+ * Convert rows of RGB input to YCbCr by calling the AltiVec routine that
+ * matches cinfo->in_color_space, with cinfo->image_width as the row width.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  /* Any color space without a dedicated routine uses the plain RGB one. */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_rgb_ycc_convert_altivec;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    convert = jsimd_extrgb_ycc_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_altivec;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Convert rows of RGB input to grayscale by calling the AltiVec routine that
+ * matches cinfo->in_color_space, with cinfo->image_width as the row width.
+ */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  /* Any color space without a dedicated routine uses the plain RGB one. */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_rgb_gray_convert_altivec;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    convert = jsimd_extrgb_gray_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_gray_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_gray_convert_altivec;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Convert rows of YCbCr input to RGB by calling the AltiVec routine that
+ * matches cinfo->out_color_space, with cinfo->output_width as the row width.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  /* Any color space without a dedicated routine uses the plain RGB one. */
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_rgb_convert_altivec;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    convert = jsimd_ycc_extrgb_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_ycc_extrgbx_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_ycc_extbgr_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_ycc_extbgrx_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_ycc_extxbgr_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_ycc_extxrgb_convert_altivec;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/* Empty stub: does nothing. jsimd_can_ycc_rgb565() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+/*
+ * Return 1 if the AltiVec h2v2 downsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec h2v1 downsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Forward to the AltiVec h2v2 downsampler, passing the image width, the
+ * maximum and per-component vertical sampling factors, and the component's
+ * width in blocks taken from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+/* Forward to the AltiVec h2v1 downsampler, passing the image width, the
+ * maximum and per-component vertical sampling factors, and the component's
+ * width in blocks taken from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+/*
+ * Return 1 if the AltiVec h2v2 upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec h2v1 upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Forward to the AltiVec h2v2 upsampler with cinfo's maximum vertical
+ * sampling factor and output width. compptr is not used.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/* Forward to the AltiVec h2v1 upsampler with cinfo's maximum vertical
+ * sampling factor and output width. compptr is not used.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+/*
+ * Return 1 if the AltiVec h2v2 fancy upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec h2v1 fancy upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Forward to the AltiVec h2v2 fancy upsampler with cinfo's maximum vertical
+ * sampling factor and the component's downsampled width.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/* Forward to the AltiVec h2v1 fancy upsampler with cinfo's maximum vertical
+ * sampling factor and the component's downsampled width.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+/*
+ * Return 1 if the AltiVec h2v2 merged upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec h2v1 merged upsampler may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Run the AltiVec h2v2 merged upsampling routine that matches
+ * cinfo->out_color_space, with cinfo->output_width as the row width.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Any color space without a dedicated routine uses the plain RGB one. */
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v2_merged_upsample_altivec;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    upsample = jsimd_h2v2_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    upsample = jsimd_h2v2_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Run the AltiVec h2v1 merged upsampling routine that matches
+ * cinfo->out_color_space, with cinfo->output_width as the row width.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Any color space without a dedicated routine uses the plain RGB one. */
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v1_merged_upsample_altivec;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    upsample = jsimd_h2v1_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    upsample = jsimd_h2v1_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Return 1 if the AltiVec sample-conversion routine may be used, 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks of 8-bit samples, a 4-byte
+   * JDIMENSION, and a 2-byte DCTELEM.
+   */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Always reports that SIMD floating-point sample conversion is unavailable. */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+/* Forward unchanged to jsimd_convsamp_altivec(). */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+/* Empty stub: does nothing. jsimd_can_convsamp_float() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+/*
+ * Return 1 if the AltiVec accurate-integer forward DCT may be used, 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks and a 2-byte DCTELEM. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec fast-integer forward DCT may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks and a 2-byte DCTELEM. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Always reports that a SIMD floating-point forward DCT is unavailable. */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+/* Forward unchanged to jsimd_fdct_islow_altivec(). */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_altivec(data);
+}
+
+/* Forward unchanged to jsimd_fdct_ifast_altivec(). */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_altivec(data);
+}
+
+/* Empty stub: does nothing. jsimd_can_fdct_float() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+/*
+ * Return 1 if the AltiVec integer quantizer may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks with 2-byte JCOEF and DCTELEM. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Always reports that SIMD floating-point quantization is unavailable. */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+/* Forward unchanged to jsimd_quantize_altivec(). */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+/* Empty stub: does nothing. jsimd_can_quantize_float() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+/* Always reports that a SIMD 2x2 reduced-size IDCT is unavailable. */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+/* Always reports that a SIMD 4x4 reduced-size IDCT is unavailable. */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+/* Empty stub: does nothing. jsimd_can_idct_2x2() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/* Empty stub: does nothing. jsimd_can_idct_4x4() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/*
+ * Return 1 if the AltiVec accurate-integer inverse DCT may be used, 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks and a 2-byte JCOEF. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/*
+ * Return 1 if the AltiVec fast-integer inverse DCT may be used, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The SIMD code only handles 8x8 blocks and a 2-byte JCOEF. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ALTIVEC) ? 1 : 0;
+}
+
+/* Always reports that a SIMD floating-point inverse DCT is unavailable. */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+/* Forward to jsimd_idct_islow_altivec(), using the component's dct_table as
+ * the first argument. cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+/* Forward to jsimd_idct_ifast_altivec(), using the component's dct_table as
+ * the first argument. cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+/* Empty stub: does nothing. jsimd_can_idct_float() in this file always
+ * returns 0.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/* Always reports that SIMD Huffman block encoding is unavailable. */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+/* Stub: ignores its arguments and returns NULL.
+ * jsimd_can_huff_encode_one_block() in this file always returns 0.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+/* Always reports that the SIMD AC-first prepare step is unavailable. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+/* Empty stub: does nothing and leaves 'values' and 'zerobits' untouched.
+ * jsimd_can_encode_mcu_AC_first_prepare() in this file always returns 0.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{
+}
+
+/* Always reports that the SIMD AC-refine prepare step is unavailable. */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+/* Stub: ignores its arguments, leaves 'absvalues' and 'bits' untouched, and
+ * returns 0. jsimd_can_encode_mcu_AC_refine_prepare() in this file always
+ * returns 0.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/powerpc/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h
new file mode 100644
index 0000000000..e8bdb06a54
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h
@@ -0,0 +1,98 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+/* Initializer-list helpers: __4X/__8X/__16X repeat one value 4, 8, or 16
+ * times; __4X2 repeats the pair (a, b) four times.
+ */
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
+
+/* TRANSPOSE(row, col): transpose an 8x8 matrix of 16-bit elements.
+ * 'row' and 'col' are identifier prefixes: the input is read from the eight
+ * variables <row>0..<row>7 and the result is written to <col>0..<col>7, all
+ * of which must already be declared by the user of the macro. The work is
+ * done in three rounds of vec_mergeh()/vec_mergel(); the temporaries are
+ * declared inside the macro's own braces. The trailing comments show which
+ * source element (row digit, column digit) lands in each lane.
+ */
+#define TRANSPOSE(row, col) { \
+ __vector short row04l, row04h, row15l, row15h, \
+ row26l, row26h, row37l, row37h; \
+ __vector short col01e, col01o, col23e, col23o, \
+ col45e, col45o, col67e, col67o; \
+ \
+ /* transpose coefficients (phase 1) */ \
+ row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+ row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+ row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+ row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+ row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+ row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+ row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+ row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+ \
+ /* transpose coefficients (phase 2) */ \
+ col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+ col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+ col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+ col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+ col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+ col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+ col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+ col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+ \
+ /* transpose coefficients (phase 3) */ \
+ col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+ col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+ col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+ col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+ col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+}
+
+/* Fallback min(), defined only if no min macro exists yet. Being a macro, it
+ * evaluates one of its arguments twice.
+ */
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+/* VEC_LD/VEC_ST map to vec_ld()/vec_st() on big-endian builds and to
+ * vec_vsx_ld()/vec_vsx_st() otherwise. VEC_UNPACKHU/VEC_UNPACKLU interleave
+ * the high/low half of a byte vector with a zero vector named 'pb_zero',
+ * which the caller must declare; the operand order is swapped between the
+ * two endiannesses.
+ */
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif
diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext-avx2.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+; Returns at once when img_width == 0 or num_rows <= 0.
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 8
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; step below the saved rbp before rounding down
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax ; stash the unaligned rsp; "pop rsp" at .return reloads it
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [wk(0)] ; reserve wk[0..WK_NUM-1] directly below rbp
+ collect_args 5 ; macro from the include; presumably fills r10-r14 as listed above
+ push rbx ; rbx serves as the Cb row pointer below
+
+ mov ecx, r10d ; rcx = img_width (32-bit move zero-extends)
+ test rcx, rcx
+ jz near .return ; zero-width image: nothing to convert
+
+ push rcx ; keep img_width while rcx holds output_row
+
+ mov rsi, r12 ; rsi = output_buf
+ mov ecx, r13d ; rcx = output_row
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0] (receives Y)
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] ; rbx = output_buf[1] (receives Cb)
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] ; rdx = output_buf[2] (receives Cr)
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; rbx = &output_buf[1][output_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; rdx = &output_buf[2][output_row]
+
+ pop rcx ; rcx = img_width again
+
+ mov rsi, r11 ; rsi = input_buf
+ mov eax, r14d ; rax = num_rows (upper 32 bits become zero)
+ test eax, eax ; 32-bit test: the sign of the int num_rows lives in bit 31
+ jle near .return ; zero or negative row count: nothing to convert
+.rowloop: ; per row: rax = rows left, rcx = img_width, rsi/rdi/rbx/rdx = row-pointer cursors
+ push rdx ; save the Cr row cursor (restored after the column loop)
+ push rbx ; save the Cb row cursor
+ push rdi ; save the Y row cursor
+ push rsi ; save the input row cursor
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_YMMWORD ; at least one full vector of pixels in this row?
+ jae near .columnloop ; yes: bulk path; otherwise fall into .column_ld1
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; fewer than SIZEOF_YMMWORD pixels left: gather the row tail piecewise
+ push rax ; rax/rdx are borrowed as scratch for the sub-dword pieces
+ push rdx
+ lea rcx, [rcx+rcx*2] ; rcx = bytes left = col * RGB_PIXELSIZE (3)
+ test cl, SIZEOF_BYTE ; each set bit of the byte count selects one piece
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx] ; rax = last byte of the tail
+.column_ld2:
+ test cl, SIZEOF_WORD ; word-sized piece outstanding?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx] ; rdx = the word that precedes what is already in rax
+ shl rax, WORD_BIT ; make room below the bytes gathered so far
+ or rax, rdx
+.column_ld4:
+ vmovd xmmA, eax ; seed the vector with the (up to 3) gathered bytes
+ pop rdx ; scratch no longer needed
+ pop rax
+ test cl, SIZEOF_DWORD ; dword-sized piece outstanding?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD ; shift gathered bytes up, new dword goes below
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD ; 8-byte piece outstanding?
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD ; shift gathered bytes up, new qword goes below
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; 16-byte piece outstanding?
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx] ; loads a full xmm register despite the MMWORD tag
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap 128-bit lanes: gathered bytes move to the upper lane
+ vpor ymmA, ymmB ; new 16 bytes fill the lower lane
+.column_ld32:
+ test cl, SIZEOF_YMMWORD ; a whole 32-byte vector outstanding?
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second vector
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; first vector comes straight from inptr
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD ; two whole vectors outstanding?
+ mov rcx, SIZEOF_YMMWORD ; mov keeps the flags; rcx = one vector so the loop tail ends the row
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA ; gathered tail becomes the third vector
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; first two vectors come straight from inptr
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; fewer than SIZEOF_YMMWORD pixels left; rcx counts pixels of RGB_PIXELSIZE bytes
+ test cl, SIZEOF_XMMWORD/16 ; each set bit of the pixel count selects one piece
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] ; last pixel of the tail
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8 ; qword-sized piece outstanding?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD ; shift the gathered pixel up, new qword goes below
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4 ; xmmword-sized piece outstanding?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA ; xmmF = pixels gathered so far
+ vperm2i128 ymmF, ymmF, ymmF, 1 ; swap 128-bit lanes: gathered pixels move to the upper lane
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] ; new xmmword goes in the lower lane
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2 ; a whole ymmword outstanding?
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second vector
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE] ; rcx has only higher bits left here
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; two whole ymmwords outstanding?
+ mov rcx, SIZEOF_YMMWORD ; mov keeps the flags; rcx = one vector so the loop tail ends the row
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA ; gathered tail becomes the third and fourth vectors
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; first two vectors come straight from inptr
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
+
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [rbx], ymm5 ; Save Cb
+
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [rel PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [rdx], ymm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_YMMWORD ; one vector of pixels has been converted
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ add rbx, byte SIZEOF_YMMWORD ; outptr1
+ add rdx, byte SIZEOF_YMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_YMMWORD ; another full vector available?
+ jae near .columnloop
+ test rcx, rcx ; leftover pixels (less than one vector)?
+ jnz near .column_ld1 ; yes: take the partial-load path
+
+ pop rcx ; col
+ pop rsi ; restore the row cursors saved at .rowloop
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW ; next Y row pointer
+ add rbx, byte SIZEOF_JSAMPROW ; next Cb row pointer
+ add rdx, byte SIZEOF_JSAMPROW ; next Cr row pointer
+ dec rax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+.return:
+ pop rbx ; matches the push rbx in the prologue
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ uncollect_args 5 ; counterpart of collect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- pre-alignment rsp stored at [rbp] (address of saved rbp)
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext-sse2.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+; Returns at once when img_width == 0 or num_rows <= 0.
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; step below the saved rbp before rounding down
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash the unaligned rsp; "pop rsp" at .return reloads it
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [wk(0)] ; reserve wk[0..WK_NUM-1] directly below rbp
+ collect_args 5 ; macro from the include; presumably fills r10-r14 as listed above
+ push rbx ; rbx serves as the Cb row pointer below
+
+ mov ecx, r10d ; rcx = img_width (32-bit move zero-extends)
+ test rcx, rcx
+ jz near .return ; zero-width image: nothing to convert
+
+ push rcx ; keep img_width while rcx holds output_row
+
+ mov rsi, r12 ; rsi = output_buf
+ mov ecx, r13d ; rcx = output_row
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0] (receives Y)
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] ; rbx = output_buf[1] (receives Cb)
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] ; rdx = output_buf[2] (receives Cr)
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; rbx = &output_buf[1][output_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; rdx = &output_buf[2][output_row]
+
+ pop rcx ; rcx = img_width again
+
+ mov rsi, r11 ; rsi = input_buf
+ mov eax, r14d ; rax = num_rows (upper 32 bits become zero)
+ test eax, eax ; 32-bit test: the sign of the int num_rows lives in bit 31
+ jle near .return ; zero or negative row count: nothing to convert
+.rowloop: ; per row: rax = rows left, rcx = img_width, rsi/rdi/rbx/rdx = row-pointer cursors
+ push rdx ; save the Cr row cursor (restored after the column loop)
+ push rbx ; save the Cb row cursor
+ push rdi ; save the Y row cursor
+ push rsi ; save the input row cursor
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_XMMWORD ; at least one full vector of pixels in this row?
+ jae near .columnloop ; yes: bulk path; otherwise fall into .column_ld1
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; fewer than SIZEOF_XMMWORD pixels left: gather the row tail piecewise
+ push rax ; rax/rdx are borrowed as scratch for the sub-dword pieces
+ push rdx
+ lea rcx, [rcx+rcx*2] ; rcx = bytes left = col * RGB_PIXELSIZE (3)
+ test cl, SIZEOF_BYTE ; each set bit of the byte count selects one piece
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx] ; rax = last byte of the tail
+.column_ld2:
+ test cl, SIZEOF_WORD ; word-sized piece outstanding?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx] ; rdx = the word that precedes what is already in rax
+ shl rax, WORD_BIT ; make room below the bytes gathered so far
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax ; seed the vector with the (up to 3) gathered bytes
+ pop rdx ; scratch no longer needed
+ pop rax
+ test cl, SIZEOF_DWORD ; dword-sized piece outstanding?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD ; shift gathered bytes up, new dword goes below
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD ; 8-byte piece outstanding?
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD ; shift gathered bytes up, new qword goes below
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; one whole vector outstanding?
+ jz short .column_ld32
+ movdqa xmmF, xmmA ; gathered tail becomes the second vector
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; first vector comes straight from inptr
+ mov rcx, SIZEOF_XMMWORD ; rcx = one vector so the loop tail ends the row
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD ; two whole vectors outstanding?
+ mov rcx, SIZEOF_XMMWORD ; mov keeps the flags; rcx = one vector so the loop tail ends the row
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA ; gathered tail becomes the third vector
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; first two vectors come straight from inptr
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; fewer than SIZEOF_XMMWORD pixels left; rcx counts pixels of RGB_PIXELSIZE bytes
+ test cl, SIZEOF_XMMWORD/16 ; each set bit of the pixel count selects one piece
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] ; last pixel of the tail
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8 ; qword-sized piece outstanding?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD ; shift the gathered pixel up, new qword goes below
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4 ; one whole xmmword outstanding?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA ; gathered tail becomes the second vector
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] ; rcx has only higher bits left here
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2 ; two whole xmmwords outstanding?
+ mov rcx, SIZEOF_XMMWORD ; mov keeps the flags; rcx = one vector so the loop tail ends the row
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA ; gathered tail becomes the third and fourth vectors
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; first two vectors come straight from inptr
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [rbx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [rdx], xmm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_XMMWORD ; one vector of pixels has been converted
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ add rbx, byte SIZEOF_XMMWORD ; outptr1
+ add rdx, byte SIZEOF_XMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_XMMWORD ; another full vector available?
+ jae near .columnloop
+ test rcx, rcx ; leftover pixels (less than one vector)?
+ jnz near .column_ld1 ; yes: take the partial-load path
+
+ pop rcx ; col
+ pop rsi ; restore the row cursors saved at .rowloop
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW ; next Y row pointer
+ add rbx, byte SIZEOF_JSAMPROW ; next Cb row pointer
+ add rdx, byte SIZEOF_JSAMPROW ; next Cr row pointer
+ dec rax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+.return:
+ pop rbx ; matches the push rbx in the prologue
+ uncollect_args 5 ; counterpart of collect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- pre-alignment rsp stored at [rbp] (address of saved rbp)
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fraction bits of the fixed-point constants: FIX(x) = x * 2^16, rounded
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700); above 32767, in this file it only feeds F_0_337
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32 ; presumably pads to a 32-byte boundary (alignz is defined outside this file)
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; 8 x word pair (0.299, 0.337) = 32 bytes
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; 8 x word pair (0.114, 0.250) = 32 bytes
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 ; 8 x word pair (-0.168, -0.331) = 32 bytes
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 ; 8 x word pair (-0.081, -0.418) = 32 bytes
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS) ; 8 dwords: (0.5 minus one LSB) + CENTERJSAMPLE in fixed point; CENTERJSAMPLE comes from outside this file
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; 8 dwords of 0.5 in fixed point (0x8000)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT ; SEG_TEXT is defined outside this file
+ BITS 64
+
+%include "jccolext-avx2.asm" ; 1st copy: uses the RGB_* values and function name already in effect
+
+%undef RGB_RED ; --- extrgb variant: replace the pixel-layout macros ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 ; generic name now expands to the extrgb name
+%include "jccolext-avx2.asm" ; extrgb copy
+
+%undef RGB_RED ; --- extrgbx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 ; generic name now expands to the extrgbx name
+%include "jccolext-avx2.asm" ; extrgbx copy
+
+%undef RGB_RED ; --- extbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 ; generic name now expands to the extbgr name
+%include "jccolext-avx2.asm" ; extbgr copy
+
+%undef RGB_RED ; --- extbgrx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 ; generic name now expands to the extbgrx name
+%include "jccolext-avx2.asm" ; extbgrx copy
+
+%undef RGB_RED ; --- extxbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 ; generic name now expands to the extxbgr name
+%include "jccolext-avx2.asm" ; extxbgr copy
+
+%undef RGB_RED ; --- extxrgb variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 ; generic name now expands to the extxrgb name
+%include "jccolext-avx2.asm" ; extxrgb copy
diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fraction bits of the fixed-point constants: FIX(x) = x * 2^16, rounded
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700); above 32767, in this file it only feeds F_0_337
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32 ; presumably pads to a 32-byte boundary (alignz is defined outside this file)
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 4 x word pair (0.299, 0.337) = 16 bytes
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 4 x word pair (0.114, 0.250) = 16 bytes
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 ; 4 x word pair (-0.168, -0.331) = 16 bytes
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 ; 4 x word pair (-0.081, -0.418) = 16 bytes; pmaddwd memory operand in jccolext-sse2.asm
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS) ; 4 dwords: (0.5 minus one LSB) + CENTERJSAMPLE in fixed point; CENTERJSAMPLE comes from outside this file
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; 4 dwords of 0.5 in fixed point; every entry above is 16 bytes, so each label stays 16-byte aligned for movdqa
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT ; SEG_TEXT is defined outside this file
+ BITS 64
+
+%include "jccolext-sse2.asm" ; 1st copy: uses the RGB_* values and function name already in effect
+
+%undef RGB_RED ; --- extrgb variant: replace the pixel-layout macros ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 ; generic name now expands to the extrgb name
+%include "jccolext-sse2.asm" ; extrgb copy
+
+%undef RGB_RED ; --- extrgbx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 ; generic name now expands to the extrgbx name
+%include "jccolext-sse2.asm" ; extrgbx copy
+
+%undef RGB_RED ; --- extbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 ; generic name now expands to the extbgr name
+%include "jccolext-sse2.asm" ; extbgr copy
+
+%undef RGB_RED ; --- extbgrx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 ; generic name now expands to the extbgrx name
+%include "jccolext-sse2.asm" ; extbgrx copy
+
+%undef RGB_RED ; --- extxbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 ; generic name now expands to the extxbgr name
+%include "jccolext-sse2.asm" ; extxbgr copy
+
+%undef RGB_RED ; --- extxrgb variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 ; generic name now expands to the extxrgb name
+%include "jccolext-sse2.asm" ; extxrgb copy
diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fraction bits of the fixed-point constants: FIX(x) = x * 2^16, rounded
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700); above 32767, in this file it only feeds F_0_337
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32 ; presumably pads to a 32-byte boundary (alignz is defined outside this file)
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337 ; 8 x word pair (0.299, 0.337) = 32 bytes; vpmaddwd operand for interleaved R/G words
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250 ; 8 x word pair (0.114, 0.250) = 32 bytes; vpmaddwd operand for interleaved B/G words
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) ; 8 dwords of 0.5 in fixed point; at offset 64, loaded with vmovdqa in jcgryext-avx2.asm
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT ; SEG_TEXT is defined outside this file
+ BITS 64
+
+%include "jcgryext-avx2.asm" ; 1st copy: uses the RGB_* values and function name already in effect
+
+%undef RGB_RED ; --- extrgb variant: replace the pixel-layout macros ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE ; jcgryext-avx2.asm picks its load/transposition path with %if RGB_PIXELSIZE == 3
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 ; the template's GLOBAL_FUNCTION/EXTN now expand to the extrgb name
+%include "jcgryext-avx2.asm" ; extrgb copy
+
+%undef RGB_RED ; --- extrgbx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 ; generic name now expands to the extrgbx name
+%include "jcgryext-avx2.asm" ; extrgbx copy
+
+%undef RGB_RED ; --- extbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 ; generic name now expands to the extbgr name
+%include "jcgryext-avx2.asm" ; extbgr copy
+
+%undef RGB_RED ; --- extbgrx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 ; generic name now expands to the extbgrx name
+%include "jcgryext-avx2.asm" ; extbgrx copy
+
+%undef RGB_RED ; --- extxbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 ; generic name now expands to the extxbgr name
+%include "jcgryext-avx2.asm" ; extxbgr copy
+
+%undef RGB_RED ; --- extxrgb variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 ; generic name now expands to the extxrgb name
+%include "jcgryext-avx2.asm" ; extxrgb copy
diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fraction bits of the fixed-point constants: FIX(x) = x * 2^16, rounded
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700); above 32767, in this file it only feeds F_0_337
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) = 22086
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32 ; presumably pads to a 32-byte boundary (alignz is defined outside this file)
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 4 x word pair (0.299, 0.337) = 16 bytes
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 4 x word pair (0.114, 0.250) = 16 bytes
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) ; 4 dwords of 0.5 in fixed point (0x8000), at offset 32
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT ; SEG_TEXT is defined outside this file
+ BITS 64
+
+%include "jcgryext-sse2.asm" ; 1st copy: uses the RGB_* values and function name already in effect
+
+%undef RGB_RED ; --- extrgb variant: replace the pixel-layout macros ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 ; the template's GLOBAL_FUNCTION/EXTN now expand to the extrgb name
+%include "jcgryext-sse2.asm" ; extrgb copy
+
+%undef RGB_RED ; --- extrgbx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 ; generic name now expands to the extrgbx name
+%include "jcgryext-sse2.asm" ; extrgbx copy
+
+%undef RGB_RED ; --- extbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 ; generic name now expands to the extbgr name
+%include "jcgryext-sse2.asm" ; extbgr copy
+
+%undef RGB_RED ; --- extbgrx variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 ; generic name now expands to the extbgrx name
+%include "jcgryext-sse2.asm" ; extbgrx copy
+
+%undef RGB_RED ; --- extxbgr variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 ; generic name now expands to the extxbgr name
+%include "jcgryext-sse2.asm" ; extxbgr copy
+
+%undef RGB_RED ; --- extxrgb variant ---
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 ; generic name now expands to the extxrgb name
+%include "jcgryext-sse2.asm" ; extxrgb copy
diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; address of ymmword wk[i]; wk[] sits just below the aligned rbp, wk(0) lowest
+%define WK_NUM 2 ; two spill slots, wk(0) and wk(1)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push rbp ; save caller's rbp
+ mov rax, rsp ; rax = rsp after the push, i.e. the address of the saved rbp
+ sub rsp, byte 4 ; step below the saved rbp so the rounded-down slot cannot overlap it (assuming rsp is 8-byte aligned here)
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax ; keep the pre-alignment rsp for the 'pop rsp' in the epilogue
+ mov rbp, rsp ; rbp = aligned frame base (points at the stored rsp)
+ lea rsp, [wk(0)] ; reserve WK_NUM ymmwords below rbp for wk[]
+ collect_args 5 ; macro defined outside this chunk; per the notes above the 5 args end up in r10..r14
+ push rbx ; restored at .return
+
+ mov ecx, r10d ; rcx = img_width (32-bit move zero-extends)
+ test rcx, rcx
+ jz near .return ; zero-width image: nothing to convert
+
+ push rcx ; park img_width while rcx is borrowed for output_row
+
+ mov rsi, r12 ; rsi = output_buf
+ mov ecx, r13d ; rcx = output_row
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0] (rdip is defined outside this chunk)
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
+
+ pop rcx ; rcx = img_width again
+
+ mov rsi, r11 ; rsi = input_buf
+ mov eax, r14d ; rax = num_rows, zero-extended to 64 bits
+ test rax, rax ; NOTE(review): rax was zero-extended, so SF is never set here
+ jle near .return ; in effect exits only for num_rows == 0; a negative int would be seen as a large count
+.rowloop: ; one image row per iteration; rax = rows left, rcx = img_width
+ push rdi ; save the output row-pointer cursor
+ push rsi ; save the input row-pointer cursor
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_YMMWORD ; at least SIZEOF_YMMWORD pixels left?
+ jae near .columnloop ; yes: full-width load; no: fall into the partial load below
+
+%if RGB_PIXELSIZE == 3 ; --------------- 3 bytes per pixel
+
+.column_ld1: ; partial load: gather the remaining bytes from the end of the row backwards
+ push rax ; rax (row counter) is borrowed as scratch
+ push rdx ; rdx is borrowed as scratch
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE -> rcx = bytes left in the row
+ test cl, SIZEOF_BYTE ; odd byte left over?
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx] ; rax = last byte
+.column_ld2:
+ test cl, SIZEOF_WORD ; a 2-byte piece left over?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT ; earlier (higher-address) byte moves up
+ or rax, rdx ; new word goes into the low 16 bits
+.column_ld4:
+ vmovd xmmA, eax ; xmmA = the bytes gathered so far (xmmA..ymmH are register aliases defined outside this chunk)
+ pop rdx
+ pop rax ; rax = row counter again
+ test cl, SIZEOF_DWORD ; a 4-byte piece left over?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD ; make room in the low dword
+ vpor xmmA, xmmA, xmmF ; merge the new 4 bytes
+.column_ld8:
+ test cl, SIZEOF_MMWORD ; an 8-byte piece left over?
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD ; make room in the low qword
+ vpor xmmA, xmmA, xmmB ; merge the new 8 bytes
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; a 16-byte piece left over?
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx] ; 16-byte load (xmm destination); upper half of ymmB is zeroed
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; swap the 128-bit halves: gathered bytes move to the upper half
+ vpor ymmA, ymmB ; new 16 bytes fill the lower half (two-operand form)
+.column_ld32:
+ test cl, SIZEOF_YMMWORD ; a 32-byte piece left over?
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second register
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; first 32 bytes of the row
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD ; a 64-byte piece left over?
+ mov rcx, SIZEOF_YMMWORD ; mov keeps the flags; rcx = one full group so the loop tail ends after this pass
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA ; gathered tail becomes the third register
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop: ; full group: three unaligned ymmword loads
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA ; ymmC = copy of the original ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA ; ymmG = copy of ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA ; ymmD = copy of ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA ; ymmE = copy of ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH ; ymmH = 0, the zero source for widening bytes to words
+
+ vmovdqa ymmC, ymmA ; ymmC = copy of ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE ; ymmB = copy of ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD ; ymmF = copy of ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; ----------- 4 bytes per pixel
+
+.column_ld1: ; partial load: rcx counts pixels here, gathered from the end of the row backwards
+ test cl, SIZEOF_XMMWORD/16 ; 1 pixel left over?
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] ; last pixel (4 bytes)
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8 ; a 2-pixel piece left over?
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD ; make room in the low qword
+ vpor xmmA, xmmA, xmmF ; merge the new 8 bytes
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4 ; a 4-pixel piece left over?
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA ; xmmF = pixels gathered so far (upper half of ymmF zeroed)
+ vperm2i128 ymmF, ymmF, ymmF, 1 ; swap the 128-bit halves: gathered pixels move to the upper half
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] ; new 16 bytes in the lower half, upper half zeroed
+ vpor ymmA, ymmA, ymmF ; combine both halves
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2 ; an 8-pixel piece left over?
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA ; gathered tail becomes the second register
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD ; a 16-pixel piece left over?
+ mov rcx, SIZEOF_YMMWORD ; mov keeps the flags; rcx = one full group so the loop tail ends after this pass
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA ; gathered tail becomes the third and
+ vmovdqa ymmH, ymmF ; fourth registers
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop: ; full group: four unaligned ymmword loads
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA ; ymmB = copy of ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF ; ymmB = copy of ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA ; ymmD = copy of ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF ; ymmC = copy of ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA ; ymmB = copy of ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD ; ymmG = copy of ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA ; ymmE = copy of ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB ; ymmH = copy of ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF ; ymmF = 0, the zero source for widening bytes to words
+
+ vmovdqa ymmC, ymmA ; ymmC = copy of ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB ; ymmD = copy of ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE ; ymmG = copy of ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF is zero here, so each word = (byte of ymmH) << 8
+ vpunpckhbw ymmH, ymmH, ymmH ; each byte of ymmH duplicated into both halves of a word
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ vmovdqa ymm6, ymm1 ; ymm6 = RO (second copy for the high halves)
+ vpunpcklwd ymm1, ymm1, ymm3 ; interleave RO/GO words, low half of each 128-bit lane
+ vpunpckhwd ymm6, ymm6, ymm3 ; interleave RO/GO words, high half of each 128-bit lane
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0 ; ymm6 = RE (second copy for the high halves)
+ vpunpcklwd ymm0, ymm0, ymm2 ; interleave RE/GE words, low halves
+ vpunpckhwd ymm6, ymm6, ymm2 ; interleave RE/GE words, high halves
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0 ; ymm4 = BO (second copy for the high halves)
+ vpunpcklwd ymm0, ymm0, ymm3 ; interleave BO/GO words, low halves
+ vpunpckhwd ymm4, ymm4, ymm3 ; interleave BO/GO words, high halves
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1 ; add the R/G partial sum (odd pixels, low halves)
+ vpaddd ymm4, ymm4, ymm7 ; add the R/G partial sum (odd pixels, high halves)
+ vpaddd ymm0, ymm0, ymm3 ; + 0.5 so the shift below rounds
+ vpaddd ymm4, ymm4, ymm3 ; + 0.5 so the shift below rounds
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6 ; ymm4 = BE (second copy for the high halves)
+ vpunpcklwd ymm6, ymm6, ymm2 ; interleave BE/GE words, low halves
+ vpunpckhwd ymm4, ymm4, ymm2 ; interleave BE/GE words, high halves
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)] ; add the spilled R/G partial sum (even pixels, low halves)
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)] ; add the spilled R/G partial sum (even pixels, high halves)
+ vpaddd ymm6, ymm6, ymm2 ; + 0.5 so the shift below rounds
+ vpaddd ymm4, ymm4, ymm2 ; + 0.5 so the shift below rounds
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT ; odd-pixel Y moves to the high byte of each word
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ sub rcx, byte SIZEOF_YMMWORD ; one group of pixels done
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_YMMWORD ; another full group left?
+ jae near .columnloop
+ test rcx, rcx ; fewer than a full group left?
+ jnz near .column_ld1 ; yes: handle them with the partial load
+
+ pop rcx ; col
+ pop rsi ; input row-pointer cursor saved at .rowloop
+ pop rdi ; output row-pointer cursor saved at .rowloop
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW ; next output row pointer
+ dec rax ; num_rows
+ jg near .rowloop ; loop while rows remain
+
+.return:
+ pop rbx ; pushed in the prologue
+ vzeroupper ; clear the upper ymm halves before returning to non-VEX code
+ uncollect_args 5 ; counterpart of collect_args 5 (defined outside this chunk)
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value stored by the prologue (address of the saved rbp)
+ pop rbp ; restore caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext-sse2.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of RGB samples to grayscale (only the Y plane is written).
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash that address at the aligned frame base
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve the wk[] scratch area below rbp
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; rcx = img_width (zero-extended)
+ test rcx, rcx
+ jz near .return ; zero-width image: nothing to convert
+
+ push rcx ; keep img_width while rcx is reused below
+
+ mov rsi, r12 ; rsi = output_buf
+ mov ecx, r13d ; rcx = output_row
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
+
+ pop rcx ; rcx = img_width again
+
+ mov rsi, r11 ; rsi = input_buf
+ mov eax, r14d ; rax = num_rows (zero-extended)
+ test rax, rax
+ jle near .return ; NOTE(review): rax was zero-extended, so only num_rows == 0 exits here
+.rowloop:
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; at least SIZEOF_XMMWORD pixels left: load a full block
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1: ; partial block: gather the remaining col*3 bytes piecewise
+ push rax ; rax (row counter) and rdx are borrowed as scratch
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE ; each test checks one bit of the remaining byte count
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx] ; pick up the trailing byte
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT ; make room below the bytes gathered so far
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax ; move the gathered bytes into the vector register
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD ; pretend a full block so the column loop ends after it
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD ; (mov does not alter the flags set by test)
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH ; zero register used to widen bytes to words
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1: ; partial block: gather the remaining pixels piecewise
+ test cl, SIZEOF_XMMWORD/16 ; each test checks one bit of the remaining pixel count
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD ; pretend a full block so the column loop ends after it (flags from test survive the mov)
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF ; zero register used to widen bytes to words
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH ; xmmF is zero here, so each source byte lands in the high byte of a word
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3 ; add the rounding term before the shift
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2 ; add the rounding term before the shift
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT ; odd-column Y values go to the high byte of each word
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y (aligned store: outptr0 must be 16-byte aligned)
+
+ sub rcx, byte SIZEOF_XMMWORD ; col -= pixels converted in this pass
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1 ; leftover pixels: convert one final partial block
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer (not accessed in this file)
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer (not accessed in this file)
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer (64 bits wide)
+.cur.free_bits resd 1 ; # of unused bits still available in put_buffer
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol (one 32-bit entry per symbol)
+.ehufsi: resb 256 ; length in bits of the code for each symbol (one byte per symbol)
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007 ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15
+ dd 0x000f, 0x001f, 0x003f, 0x007f
+ dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+ alignz 32
+
+times 1 << 14 db 15 ; mirrored half of the table, reached with negative offsets (-32768..-1)
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table: ; table[x] = bit length of x for 0 <= x <= 65535, and bit length of ~x for x < 0
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+ alignz 32
+
+%define NBITS(x) nbits_base + x ; address of jpeg_nbits_table[x], with nbits_base = &jpeg_nbits_table
+%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) ; address of jpeg_mask_bits[x], reached via nbits_base
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values
+; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to 64 minus the number of remaining
+; bits from code, and the low bits of put_buffer will contain those remaining
+; bits. nbits, temp, code, and xmm0 will be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+ add nbitsb, free_bitsb ; nbits += free_bits; (callers pre-subtract nbits, so free_bits <= 0 here)
+ neg free_bitsb ; free_bits = -free_bits; (# of bits of code that do not fit)
+ mov tempd, code ; temp = code;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbitsb, free_bitsb ; nbits = free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ shr tempd, nbitsb ; temp >>= nbits;
+ or tempq, put_buffer ; temp |= put_buffer;
+ movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
+ bswap tempq ; temp = bswap64(temp); (most significant byte first in memory)
+ mov put_buffer, codeq ; put_buffer = code;
+ pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0); (expects b1[i] == 0xFF)
+ %2
+ pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
+ mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
+ ; (speculative; will be overwritten if
+ ; code contains any 0xFF bytes)
+ add free_bitsb, 64 ; free_bits += 64;
+ add bufferp, 8 ; buffer += 8;
+ test code, code ; if (code == 0) /* No 0xFF bytes */
+ jz %1 ; return;
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer-7], 0 ; buffer[-7] = 0; (stuffed 0x00, kept only if temp[0] == 0xFF)
+ sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempd, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ jmp %1 ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation ("a --> b": the register is re-purposed from a to b later on)
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8 - dctbl --> code_temp
+; r9 - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer rax
+%ifdef WIN64
+%define bufferp rax
+%else
+%define bufferp raxp ; raxp comes from jsimdext.inc -- presumably the pointer-width alias of rax (TODO confirm)
+%endif
+%define tempq rbx
+%define tempd ebx
+%define tempb bl ; lowest byte of temp
+%define temph bh ; second-lowest byte of temp
+%define nbitsq rcx
+%define nbits ecx
+%define nbitsb cl ; cl, so nbits can be used directly as a shift count
+%define block rdx
+%define nbits_base rsi
+%define t rdi
+%define td edi
+%define codeq rbp
+%define code ebp
+%define dctbl r8
+%define actbl r9
+%define state r10
+%define index r11
+%define indexd r11d
+%define put_buffer r12
+%define put_bufferd r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rsp+40] = c_derived_tbl *dctbl (stack argument; offset as of function entry)
+; [rsp+48] = c_derived_tbl *actbl (stack argument; offset as of function entry)
+
+ ;X: X = code stream
+ mov buffer, rdx
+ mov block, r8
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push rsi
+ push rdi
+ push r12
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ mov state, rcx
+ movsx code, word [block] ;Z: code = block[0];
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub code, r9d ;Z: code -= last_dc_val;
+ mov dctbl, POINTER [rsp+6*8+4*8] ; skip 5 pushes + return address (6*8) and 4 register-argument home slots (4*8)
+ mov actbl, POINTER [rsp+6*8+5*8]
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea nbits_base, [rel jpeg_nbits_table]
+ add rsp, -DCTSIZE2 * SIZEOF_WORD ; allocate t_[DCTSIZE2] on the stack
+ mov t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push r12
+ mov state, rdi
+ mov buffer, rsi
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ movsx codeq, word [block] ;Z: code = block[0];
+ lea nbits_base, [rel jpeg_nbits_table]
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub codeq, rcx ;Z: code -= last_dc_val; (only the low 32 bits are used afterwards)
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
+
+%endif
+
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
+ ;Z: i.e. if code is positive
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ movsxd codeq, code ;Z: sign extend code
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
+ pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl tempd, 16 ;Z: temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ or put_bufferd, tempd ;Z: put_buffer |= temp;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+ ;Z: temp = dctbl->ehufco[nbits];
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ shl tempq, nbitsb ;Z: temp <<= nbits;
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ or code, tempd ;Z: code |= temp;
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define free_bitsq rdx
+%define free_bitsd edx
+%define free_bitsb dl
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ shl tempq, 48 ;Z: temp <<= 48;
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ or tempq, put_buffer ;Z: temp |= put_buffer;
+ movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ lea t, [dword t - 2] ;Z: t = &t[-1];
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+ ;Z: nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp r8d
+ pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
+ mov free_bitsd, [state+working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+ pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF; (constant used by EMIT_QWORD)
+ shl index, 32 ;Z: index <<= 32;
+ mov put_buffer, [state+working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ or index, tempq ;Z: index |= temp;
+ not index ;Z: index = ~index; (a set bit now marks a nonzero coefficient)
+ sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
+ jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
+ align 16
+.EMIT_CODE: ;Z: .EMIT_CODE:
+ EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BRLOOP: ; do {
+ lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbits, code_temp ; nbits = code_temp;
+ or put_buffer, codeq ; put_buffer |= code;
+ cmp nbits, 16 ; if (nbits <= 16)
+ jle .ERLOOP ; break;
+ jmp .BRLOOP ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+ times 5 nop ; NOTE(review): presumably padding to place the labels below at a chosen offset -- confirm
+.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.BLOOP_COND: ; .BLOOP_COND:
+ test index, index ; if (index != 0)
+ jz .ELOOP ; {
+.BLOOP: ; do {
+ xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
+ tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
+ inc nbits ; ++nbits;
+ lea t, [t + nbitsq * 2] ; t = &t[nbits];
+ shr index, nbitsb ; index >>= nbits;
+.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
+ cmp nbits, 16 ; if (nbits > 16)
+ jg .BRLOOP ; goto .BRLOOP;
+.ERLOOP: ; .ERLOOP:
+ movsx codeq, word [t] ; code = *t;
+ lea tempd, [nbitsq * 2] ; temp = nbits * 2;
+ movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
+ lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
+ mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+ ; code_temp = actbl->ehufco[temp-16];
+ shl code_temp, nbitsb ; code_temp <<= nbits;
+ and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
+ add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+ ; nbits += actbl->ehufsi[temp-16];
+ or code, code_temp ; code |= code_temp;
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_CODE ; goto .EMIT_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP ; } while (index != 0);
+.ELOOP: ; } /* index != 0 */
+ sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+ cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
+%else
+ cmp td, -2 * SIZEOF_WORD ; if (t != -2)
+%endif
+ je .EFN ; {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD .EFN ; insert code, flush buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.EFN: ; } }
+ mov [state + working_state.cur.put_buffer.simd], put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ mov byte [state + working_state.cur.free_bits], free_bitsb
+ ; state->cur.free_bits = free_bits; (only the low byte of the field is written)
+%ifdef WIN64
+ sub rsp, -DCTSIZE2 * SIZEOF_WORD ; release t_[]
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+%else
+ pop r12
+ pop rbp
+ pop rbx
+%endif
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP_CODE:
+ EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+ ; insert code, flush buffer,
+ ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0 ; gather 16 coefficients: X0.w[i] = BLOCK[LUT[i]], X1.w[i] = BLOCK[LUT[8+i]] for i = 0..7; clobbers T0, T1
+ pxor N0, N0 ; N0 = 0 (callers follow with pcmpgtw N0, X0 to build the sign mask)
+ pxor N1, N1 ; N1 = 0 (same, for X1)
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT] ; T0 = LUT[0] (write to the 32-bit alias zero-extends into T0)
+ mov T1d, INT [LUT + 8*SIZEOF_INT] ; T1 = LUT[8]
+ pinsrw X0, word [BLOCK + T0 * 2], 0 ; X0.w[0] = BLOCK[T0] (16-bit elements, hence * 2)
+ pinsrw X1, word [BLOCK + T1 * 2], 0 ; X1.w[0] = BLOCK[T1]
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT] ; lane 1: indices LUT[1] and LUT[9]
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT] ; lane 2: indices LUT[2] and LUT[10]
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT] ; lane 3: indices LUT[3] and LUT[11]
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT] ; lane 4: indices LUT[4] and LUT[12]
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT] ; lane 5: indices LUT[5] and LUT[13]
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT] ; lane 6: indices LUT[6] and LUT[14]
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT] ; lane 7: indices LUT[7] and LUT[15]
+ mov T1d, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0 ; gather 9..15 coefficients: X0 = BLOCK[LUT[0..7]], X1.w[i] = BLOCK[LUT[8+i]] for i < LENEND, remaining X1 lanes = 0; clobbers T0, T1
+ pxor N0, N0 ; N0 = 0 (callers follow with pcmpgtw N0, X0 to build the sign mask)
+ pxor N1, N1 ; N1 = 0 (same, for X1)
+ pxor X1, X1 ; X1 = 0 so that lanes skipped below read as zero coefficients
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT] ; T0 = LUT[0] (write to the 32-bit alias zero-extends into T0)
+ mov T1d, INT [LUT + 8*SIZEOF_INT] ; T1 = LUT[8]; lane 0 of X1 is loaded unconditionally (callers only get here with LENEND != 0)
+ pinsrw X0, word [BLOCK + T0 * 2], 0 ; X0.w[0] = BLOCK[LUT[0]]
+ pinsrw X1, word [BLOCK + T1 * 2], 0 ; X1.w[0] = BLOCK[LUT[8]]
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT] ; X0 lanes 1..7 are always loaded
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2 ; X1 lane 1 exists only if LENEND >= 2
+ jl %%.ELOAD15 ; signed compare; stop at the first missing lane
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3 ; X1 lane 2 exists only if LENEND >= 3
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4 ; X1 lane 3 exists only if LENEND >= 4
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5 ; X1 lane 4 exists only if LENEND >= 5
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6 ; X1 lane 5 exists only if LENEND >= 6
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7 ; X1 lane 6 exists only if LENEND >= 7; lane 7 is never loaded here
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15: ; macro-local exit label (unique per expansion)
+%endmacro
+
+%macro LOAD8 0 ; gather exactly 8 coefficients: X0.w[i] = BLOCK[LUT[i]] for i = 0..7; clobbers T0
+ pxor N0, N0 ; N0 = 0 (callers follow with pcmpgtw N0, X0 to build the sign mask)
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT] ; T0 = LUT[0] (write to the 32-bit alias zero-extends into T0)
+ pinsrw X0, word [BLOCK + T0 * 2], 0 ; X0.w[0] = BLOCK[T0] (16-bit elements, hence * 2)
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT] ; lanes 1..7 follow the same pattern
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT] ; all 8 lanes are written, so X0 needs no prior clearing
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0 ; gather 1..7 coefficients: X0.w[i] = BLOCK[LUT[i]] for i < LENEND, remaining lanes = 0; clobbers T1
+ pxor N0, N0 ; N0 = 0 (callers follow with pcmpgtw N0, X0 to build the sign mask)
+ pxor X0, X0 ; X0 = 0 so that lanes skipped below read as zero coefficients
+
+ mov T1d, INT [LUT + 0*SIZEOF_INT] ; lane 0 is loaded unconditionally; NOTE(review): assumes LENEND >= 1 here - confirm for callers with LEN % 16 == 0 and no full batch
+ pinsrw X0, word [BLOCK + T1 * 2], 0 ; X0.w[0] = BLOCK[LUT[0]]
+
+ cmp LENEND, 2 ; lane 1 exists only if LENEND >= 2
+ jl %%.ELOAD7 ; signed compare; stop at the first missing lane
+ mov T1d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3 ; lane 2 exists only if LENEND >= 3
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4 ; lane 3 exists only if LENEND >= 4
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5 ; lane 4 exists only if LENEND >= 5
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6 ; lane 5 exists only if LENEND >= 6
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7 ; lane 6 exists only if LENEND >= 7; lane 7 is never loaded here
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7: ; macro-local exit label (unique per expansion)
+%endmacro
+
+%macro REDUCE0 0 ; store at [r15] a 64-bit map with bit k set iff word VALUES[k] != 0 (k = 0..63); clobbers xmm0-xmm7, rax, rcx, rdx, rsi
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)] ; aligned loads: VALUES must be 16-byte aligned
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+ movdqa xmm7, XMMWORD [VALUES + (56*2)] ; xmm0..xmm7 now hold all 64 words
+
+ pcmpeqw xmm0, ZERO ; each word becomes 0xFFFF if it was zero, else 0x0000
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, ZERO
+
+ packsswb xmm0, xmm1 ; narrow word masks to byte masks (0xFFFF -> 0xFF, 0 -> 0): 16 flags per register
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0 ; bits 0..15 : "is zero" flags for words 0..15
+ pmovmskb ecx, xmm2 ; bits 0..15 : flags for words 16..31
+ pmovmskb edx, xmm4 ; bits 0..15 : flags for words 32..47
+ pmovmskb esi, xmm6 ; bits 0..15 : flags for words 48..63
+
+ shl rcx, 16 ; move each 16-bit group to its position in the 64-bit map
+ shl rdx, 32
+ shl rsi, 48
+
+ or rax, rcx ; rax = "is zero" bitmap for all 64 words
+ or rdx, rsi
+ or rax, rdx
+
+ not rax ; invert: bit set <=> word is non-zero
+
+ mov MMWORD [r15], rax ; write the bitmap through the pointer held in r15
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first(): for the Sl coefficients picked from block[] through the index table, store |coef| >> Al in values[0..] and that value XORed with the sign mask in values[DCTSIZE2..], zero-fill values[] up to DCTSIZE2 entries, and write the non-zero bitmap of values[0..63] to *zerobits.
+; values must be 16-byte aligned (movdqa stores). Registers written besides the r10-r15 argument registers: rax, rcx, rdx, rsi, xmm0-xmm7, xmm9 (xmm9 is saved/restored here).
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl (only r12d is read, as LEN)
+; r13 = int Al (only r13d is read; r13d is reused as LENEND once Al sits in xmm4)
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
+
+%define ZERO xmm9 ; all-zero constant
+%define X0 xmm0 ; coefficients 0..7 of the current batch
+%define X1 xmm1 ; coefficients 8..15 of the current batch
+%define N0 xmm2 ; sign mask / sign-adjusted value for X0
+%define N1 xmm3 ; sign mask / sign-adjusted value for X1
+%define AL xmm4 ; right-shift count (Al)
+%define K eax ; loop counter
+%define LUT r11 ; cursor into the index table
+%define T0 rcx ; scratch index
+%define T0d ecx
+%define T1 rdx ; scratch index
+%define T1d edx
+%define BLOCK r10 ; coefficient block base
+%define VALUES r14 ; output cursor
+%define LEN r12d ; number of coefficients (Sl)
+%define LENEND r13d ; LEN & 7
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; step below the saved rbp before aligning
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; remember the unaligned rsp so the epilogue can restore it
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [rbp - 16] ; reserve one aligned 16-byte slot for saving xmm9
+ collect_args 6 ; macro from jsimdext.inc; per the header above it leaves the six arguments in r10-r15
+
+ movdqa XMMWORD [rbp - 16], ZERO ; save xmm9 (restored before returning)
+
+ movd AL, r13d ; AL = Al as a shift count; r13d is free for reuse after this
+ pxor ZERO, ZERO ; ZERO = 0
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7 ; LENEND = LEN % 8 (size of the final partial group of 8)
+ shr K, 4 ; K = LEN / 16 = number of full 16-coefficient batches
+ jz .ELOOP16 ; no full batch: go straight to the tail handling
+.BLOOP16: ; ---- one full batch of 16 coefficients per iteration
+ LOAD16
+ pcmpgtw N0, X0 ; N0 = (0 > x) ? 0xFFFF : 0 (per-word sign mask)
+ pcmpgtw N1, X1
+ paddw X0, N0 ; x + mask ...
+ paddw X1, N1
+ pxor X0, N0 ; ... ^ mask = |x|
+ pxor X1, N1
+ psrlw X0, AL ; |x| >> Al
+ psrlw X1, AL
+ pxor N0, X0 ; N = mask ^ (|x| >> Al): bitwise complement of the shifted value when x < 0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0 ; values[k..k+7] = |x| >> Al
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 ; values[DCTSIZE2 + k ..] = sign-adjusted form
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2 ; advance output by 16 words
+ add LUT, 16*SIZEOF_INT ; advance index table by 16 entries
+ dec K
+ jnz .BLOOP16
+ test LEN, 15
+ je .PADDING ; LEN is a multiple of 16: no tail
+.ELOOP16: ; ---- tail: 1..15 remaining coefficients
+ test LEN, 8
+ jz .TRY7 ; fewer than 8 remain
+ test LEN, 7
+ jz .TRY8 ; exactly 8 remain
+
+ LOAD15 ; 9..15 remain
+ pcmpgtw N0, X0 ; same abs / shift / sign-adjust sequence as in .BLOOP16
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8: ; exactly 8 coefficients remain
+ LOAD8
+ pcmpgtw N0, X0 ; single-register version of the sequence above
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7: ; 1..7 coefficients remain (missing lanes are zero)
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING: ; ---- zero-fill the rest of values[0..DCTSIZE2-1]
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3 ; K = ceil(LEN / 8) = groups of 8 already written
+ sub K, DCTSIZE2/8 ; K = -(groups still missing)
+ jz .EPADDING ; nothing to pad
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO ; only the first half (values[0..63]) is padded
+ add VALUES, 8*2
+ inc K ; count up towards zero
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2 ; VALUES advanced by DCTSIZE2 words in total; rewind to values[0]
+
+ REDUCE0 ; *zerobits = non-zero bitmap of values[0..63]
+
+ movdqa ZERO, XMMWORD [rbp - 16] ; restore xmm9
+ uncollect_args 6 ; macro from jsimdext.inc; presumably undoes collect_args
+ mov rsp, rbp ; rsp <- aligned frame base
+ pop rsp ; rsp <- value saved at [rbp] by the prologue (address of the saved rbp)
+ pop rbp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine(): for the Sl coefficients picked from block[] through the index table, store |coef| >> Al in absvalues[], zero-fill absvalues[] up to DCTSIZE2 entries, write the non-zero bitmap of absvalues[] to bits[0] and the inverted sign bitmap to bits[1].
+; Returns in eax the index of the last coefficient whose shifted magnitude equals 1 (0 if there is none). absvalues must be 16-byte aligned (movdqa). Registers written besides r10-r15: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm7, xmm9 (xmm9 is saved/restored here).
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl (only r12d is read, as LEN)
+; r13 = int Al (only r13d is read; r13d is reused as LENEND once Al sits in xmm4)
+; r14 = JCOEF *values
+; r15 = size_t *bits
+
+%define ZERO xmm9 ; all-zero constant
+%define ONE xmm5 ; 0x0001 in every word
+%define X0 xmm0 ; coefficients 0..7 of the current batch
+%define X1 xmm1 ; coefficients 8..15 of the current batch
+%define N0 xmm2 ; sign mask for X0
+%define N1 xmm3 ; sign mask for X1
+%define AL xmm4 ; right-shift count (Al)
+%define K eax ; loop counter
+%define KK r9d ; index of the first coefficient of the current batch
+%define EOB r8d ; running result (returned in eax)
+%define SIGN rdi ; sign bitmap, filled from the top and shifted down
+%define LUT r11 ; cursor into the index table
+%define T0 rcx ; scratch
+%define T0d ecx
+%define T1 rdx ; scratch
+%define T1d edx
+%define BLOCK r10 ; coefficient block base
+%define VALUES r14 ; output cursor
+%define LEN r12d ; number of coefficients (Sl)
+%define LENEND r13d ; LEN & 7
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; step below the saved rbp before aligning
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; remember the unaligned rsp so the epilogue can restore it
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [rbp - 16] ; reserve one aligned 16-byte slot for saving xmm9
+ collect_args 6 ; macro from jsimdext.inc; per the header above it leaves the six arguments in r10-r15
+
+ movdqa XMMWORD [rbp - 16], ZERO ; save xmm9 (restored before returning)
+
+ xor SIGN, SIGN ; sign bitmap = 0
+ xor EOB, EOB ; EOB = 0
+ xor KK, KK ; k = 0
+ movd AL, r13d ; AL = Al as a shift count; r13d is free for reuse after this
+ pxor ZERO, ZERO ; ZERO = 0
+ pcmpeqw ONE, ONE ; all bits set ...
+ psrlw ONE, 15 ; ... >> 15 = 0x0001 in every word
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7 ; LENEND = LEN % 8 (size of the final partial group of 8)
+ shr K, 4 ; K = LEN / 16 = number of full 16-coefficient batches
+ jz .ELOOPR16 ; no full batch: go straight to the tail handling
+.BLOOPR16: ; ---- one full batch of 16 coefficients per iteration
+ LOAD16
+ pcmpgtw N0, X0 ; N0 = (0 > x) ? 0xFFFF : 0 (per-word sign mask)
+ pcmpgtw N1, X1
+ paddw X0, N0 ; x + mask ...
+ paddw X1, N1
+ pxor X0, N0 ; ... ^ mask = |x|
+ pxor X1, N1
+ psrlw X0, AL ; |x| >> Al
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0 ; absvalues[k..k+7]
+ movdqa XMMWORD [VALUES + (8) * 2], X1 ; absvalues[k+8..k+15]
+ pcmpeqw X0, ONE ; 0xFFFF where the shifted magnitude == 1
+ pcmpeqw X1, ONE
+ packsswb N0, N1 ; 16 sign masks narrowed to bytes
+ packsswb X0, X1 ; 16 "== 1" masks narrowed to bytes
+ pmovmskb T0d, N0 ; T0 = 16 sign bits of this batch (bit i set <=> coefficient k+i < 0)
+ pmovmskb T1d, X0 ; T1 = 16 "== 1" bits of this batch
+ shr SIGN, 16 ; shift earlier batches down to make room for these 16 sign bits
+ shl T0, 48
+ or SIGN, T0 ; newest batch occupies the top 16 bits
+ bsr T1d, T1d ; T1d = index of the highest set bit; ZF = 1 when the mask was zero
+ jz .CONTINUER16 ; no coefficient with magnitude 1 in this batch
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2 ; advance output by 16 words
+ add LUT, 16*SIZEOF_INT ; advance index table by 16 entries
+ add KK, 16 ; k += 16
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15
+ je .PADDINGR ; LEN is a multiple of 16: no tail
+.ELOOPR16: ; ---- tail: 1..15 remaining coefficients
+ test LEN, 8
+ jz .TRYR7 ; fewer than 8 remain
+ test LEN, 7
+ jz .TRYR8 ; exactly 8 remain
+
+ LOAD15 ; 9..15 remain
+ pcmpgtw N0, X0 ; same sequence as in .BLOOPR16
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; T0 = 16 sign bits (missing lanes are zero coefficients, so their bits are 0)
+ pmovmskb T1d, X0 ; T1 = 16 "== 1" bits
+ shr SIGN, 16 ; make room for 16 sign bits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; T1d = index of the highest set bit; ZF = 1 when the mask was zero
+ jz .CONTINUER15 ; no coefficient with magnitude 1 in the tail
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8: ; exactly 8 coefficients remain
+ LOAD8
+
+ pcmpgtw N0, X0 ; single-register version of the sequence above
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO ; upper 8 bytes come from ZERO, so only 8 mask bits can be set
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; T0 = 8 sign bits
+ pmovmskb T1d, X0 ; T1 = 8 "== 1" bits
+ shr SIGN, 8 ; make room for 8 sign bits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; T1d = index of the highest set bit; ZF = 1 when the mask was zero
+ jz .CONTINUER8 ; no coefficient with magnitude 1 in the tail
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7: ; 1..7 coefficients remain (missing lanes are zero)
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO ; upper 8 bytes come from ZERO, so only 8 mask bits can be set
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; T0 = 8 sign bits
+ pmovmskb T1d, X0 ; T1 = 8 "== 1" bits
+ shr SIGN, 8 ; make room for 8 sign bits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; T1d = index of the highest set bit; ZF = 1 when the mask was zero
+ jz .CONTINUER7 ; no coefficient with magnitude 1 in the tail
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR: ; ---- zero-fill the rest of absvalues[] and finish aligning SIGN
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3 ; K = ceil(LEN / 8) = groups of 8 already written
+ sub K, DCTSIZE2/8 ; K = -(groups still missing)
+ jz .EPADDINGR ; nothing to pad
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO ; 8 zero words
+ shr SIGN, 8 ; one padded group = 8 zero sign bits shifted in at the top
+ add VALUES, 8*2
+ inc K ; count up towards zero
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ not SIGN ; stored form is inverted: bit set <=> coefficient is not negative
+ sub VALUES, DCTSIZE2*2 ; VALUES advanced by DCTSIZE2 words in total; rewind to absvalues[0]
+ mov MMWORD [r15+SIZEOF_MMWORD], SIGN ; bits[1] = inverted sign bitmap
+
+ REDUCE0 ; bits[0] = non-zero bitmap of absvalues[0..63] (clobbers rax, so EOB is moved to eax afterwards)
+
+ mov eax, EOB ; return value
+ movdqa ZERO, XMMWORD [rbp - 16] ; restore xmm9
+ uncollect_args 6 ; macro from jsimdext.inc; presumably undoes collect_args
+ mov rsp, rbp ; rsp <- aligned frame base
+ pop rsp ; rsp <- value saved at [rbp] by the prologue (address of the saved rbp)
+ pop rbp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing. Each output sample is (a + b + bias) >> 1 with bias alternating 0, 1.
+; Input rows are first padded on the right (last sample replicated) up to 2 * width_in_blocks * DCTSIZE samples.
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11d = int max_v_samp_factor (only the low 32 bits are defined for an int argument)
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6 ; macro from jsimdext.inc; per the list above it leaves the six arguments in r10-r15
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; width_in_blocks == 0: nothing to do
+
+ mov edx, r10d ; rdx = image_width (zero-extended)
+
+ ; -- expand_right_edge
+
+ push rcx ; keep output_cols for the downsampling stage
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx ; rcx = number of pad samples needed per input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor; 32-bit move because the upper half of an int argument register is unspecified
+ test eax, eax
+ jle short .expand_end ; no rows to pad
+
+ cld ; rep stosb must run forwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al and rcx are consumed by rep stosb below
+ push rcx
+
+ mov rdip, JSAMPROW [rsi] ; rdi = current input row
+ add rdi, rdx ; rdi = one past the last real sample
+ mov al, JSAMPLE [rdi-1] ; al = last real sample
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; saved per row: output_cols and the two row-pointer cursors
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop ; at least 32 output samples left: full-width path
+
+.columnloop_r24:
+ ; rcx can possibly be 8, 16, 24
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; 48 input bytes: 32 + 16
+ vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD] ; VEX xmm load clears the upper half of ymm1
+ mov rcx, SIZEOF_YMMWORD ; so that the subtraction below ends the column loop
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; 32 input bytes
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD] ; 16 input bytes
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; 64 input bytes -> 32 output bytes
+ vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT ; odd-numbered samples as words
+ vpand ymm0, ymm0, ymm6 ; even-numbered samples as words
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2 ; pair sums
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7 ; + alternating bias
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1 ; / 2
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1 ; words -> bytes (packs within each 128-bit lane)
+ vpermq ymm0, ymm0, 0xd8 ; reorder qwords 0,2,1,3 to undo the per-lane interleave
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 ; always a full 32-byte store, also on the tail paths
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24 ; 8, 16 or 24 samples left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper ; leave the upper ymm halves clean for non-VEX SSE code in the caller
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing. Each output sample is (a + b + c + d + bias) >> 2 over a 2x2 group, with bias alternating 1, 2.
+; Input rows are first padded on the right (last sample replicated) up to 2 * width_in_blocks * DCTSIZE samples.
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11d = int max_v_samp_factor (only the low 32 bits are defined for an int argument)
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6 ; macro from jsimdext.inc; per the list above it leaves the six arguments in r10-r15
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; width_in_blocks == 0: nothing to do
+
+ mov edx, r10d ; rdx = image_width (zero-extended)
+
+ ; -- expand_right_edge
+
+ push rcx ; keep output_cols for the downsampling stage
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx ; rcx = number of pad samples needed per input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor; 32-bit move because the upper half of an int argument register is unspecified
+ test eax, eax
+ jle short .expand_end ; no rows to pad
+
+ cld ; rep stosb must run forwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al and rcx are consumed by rep stosb below
+ push rcx
+
+ mov rdip, JSAMPROW [rsi] ; rdi = current input row
+ add rdi, rdx ; rdi = one past the last real sample
+ mov al, JSAMPLE [rdi-1] ; al = last real sample
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; saved per row pair: output_cols and the two row-pointer cursors
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop ; at least 32 output samples left: full-width path
+
+.columnloop_r24: ; tail: rcx is 8, 16 or 24 here
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; 48 input bytes per row: 32 + 16
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD] ; VEX xmm loads clear the upper ymm halves
+ vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+ mov rcx, SIZEOF_YMMWORD ; so that the subtraction below ends the column loop
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; 32 input bytes per row
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; 16 input bytes per row
+ vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; 64 input bytes per row -> 32 output bytes
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6 ; even-numbered samples as words
+ vpsrlw ymm0, ymm0, BYTE_BIT ; odd-numbered samples as words
+ vpand ymm5, ymm1, ymm6
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4 ; horizontal pair sums, row 0
+ vpaddw ymm1, ymm1, ymm5 ; horizontal pair sums, row 1
+
+ vpand ymm4, ymm2, ymm6 ; same for the second 32 bytes of each row
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1 ; add the two rows: 2x2 sums
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7 ; + alternating bias
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2 ; / 4
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2 ; words -> bytes (packs within each 128-bit lane)
+ vpermq ymm0, ymm0, 0xd8 ; reorder qwords 0,2,1,3 to undo the per-lane interleave
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 ; always a full 32-byte store, also on the tail paths
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24 ; 8, 16 or 24 samples left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper ; leave the upper ymm halves clean for non-VEX SSE code in the caller
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing. Each output sample is (a + b + bias) >> 1 with bias alternating 0, 1.
+; Input rows are first padded on the right (last sample replicated) up to 2 * width_in_blocks * DCTSIZE samples; row buffers must be 16-byte aligned (movdqa).
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11d = int max_v_samp_factor (only the low 32 bits are defined for an int argument)
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6 ; macro from jsimdext.inc; per the list above it leaves the six arguments in r10-r15
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; width_in_blocks == 0: nothing to do
+
+ mov edx, r10d ; rdx = image_width (zero-extended)
+
+ ; -- expand_right_edge
+
+ push rcx ; keep output_cols for the downsampling stage
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx ; rcx = number of pad samples needed per input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor; 32-bit move because the upper half of an int argument register is unspecified
+ test eax, eax
+ jle short .expand_end ; no rows to pad
+
+ cld ; rep stosb must run forwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al and rcx are consumed by rep stosb below
+ push rcx
+
+ mov rdip, JSAMPROW [rsi] ; rdi = current input row
+ add rdi, rdx ; rdi = one past the last real sample
+ mov al, JSAMPLE [rdi-1] ; al = last real sample
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; saved per row: output_cols and the two row-pointer cursors
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least 16 output samples left: full-width path
+
+.columnloop_r8: ; tail: 8 output samples left (output_cols is a multiple of 8)
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; 16 input bytes
+ pxor xmm1, xmm1
+ mov rcx, SIZEOF_XMMWORD ; so that the subtraction below ends the column loop
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; 32 input bytes -> 16 output bytes
+ movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6 ; even-numbered samples as words
+ psrlw xmm2, BYTE_BIT ; odd-numbered samples as words
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2 ; pair sums
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7 ; + alternating bias
+ paddw xmm1, xmm7
+ psrlw xmm0, 1 ; / 2
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1 ; words -> bytes
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; always a full 16-byte store, also on the tail path
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz short .columnloop_r8 ; 8 samples left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing. Each output sample is (a + b + c + d + bias) >> 2 over a 2x2 group, with bias alternating 1, 2.
+; Input rows are first padded on the right (last sample replicated) up to 2 * width_in_blocks * DCTSIZE samples; row buffers must be 16-byte aligned (movdqa).
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11d = int max_v_samp_factor (only the low 32 bits are defined for an int argument)
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6 ; macro from jsimdext.inc; per the list above it leaves the six arguments in r10-r15
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; width_in_blocks == 0: nothing to do
+
+ mov edx, r10d ; rdx = image_width (zero-extended)
+
+ ; -- expand_right_edge
+
+ push rcx ; keep output_cols for the downsampling stage
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx ; rcx = number of pad samples needed per input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor; 32-bit move because the upper half of an int argument register is unspecified
+ test eax, eax
+ jle short .expand_end ; no rows to pad
+
+ cld ; rep stosb must run forwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al and rcx are consumed by rep stosb below
+ push rcx
+
+ mov rdip, JSAMPROW [rsi] ; rdi = current input row
+ add rdi, rdx ; rdi = one past the last real sample
+ mov al, JSAMPLE [rdi-1] ; al = last real sample
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; saved per row pair: output_cols and the two row-pointer cursors
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least 16 output samples left: full-width path
+
+.columnloop_r8: ; tail: 8 output samples left (output_cols is a multiple of 8)
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; 16 input bytes per row
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov rcx, SIZEOF_XMMWORD ; so that the subtraction below ends the column loop
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; 32 input bytes per row -> 16 output bytes
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6 ; even-numbered samples as words
+ psrlw xmm4, BYTE_BIT ; odd-numbered samples as words
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4 ; horizontal pair sums, row 0
+ paddw xmm1, xmm5 ; horizontal pair sums, row 1
+
+ movdqa xmm4, xmm2 ; same for the second 16 bytes of each row
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1 ; add the two rows: 2x2 sums
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7 ; + alternating bias
+ paddw xmm2, xmm7
+ psrlw xmm0, 2 ; / 4
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2 ; words -> bytes
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; always a full 16-byte store, also on the tail path
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r8 ; 8 samples left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext-avx2.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax ; keep the unaligned rsp at the base of the aligned frame
+ mov rbp, rsp ; rbp = aligned frame base, [rbp] = saved rsp
+ lea rsp, [wk(0)] ; reserve WK_NUM ymmwords of scratch below rbp
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return ; zero-width output: nothing to convert
+
+ push rcx ; keep num_cols while rcx holds input_row
+
+ mov rdi, r11 ; input_buf
+ mov ecx, r12d ; input_row
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; input_buf[0]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; input_buf[1]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; input_buf[2]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; &input_buf[0][input_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; &input_buf[1][input_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; &input_buf[2][input_row]
+
+ pop rcx ; rcx = num_cols again
+
+ mov rdi, r13 ; output_buf
+ mov eax, r14d ; num_rows (signed int; the mov zero-extends into rax)
+ test eax, eax ; test the 32-bit value so a negative num_rows is seen as negative
+ jle near .return ; num_rows <= 0: nothing to convert
+.rowloop:
+ push rax ; remaining row count
+ push rdi ; output row-pointer cursor
+ push rdx ; input_buf[2] row-pointer cursor
+ push rbx ; input_buf[1] row-pointer cursor
+ push rsi ; input_buf[0] row-pointer cursor
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0 (Y samples)
+ mov rbxp, JSAMPROW [rbx] ; inptr1 (Cb samples)
+ mov rdxp, JSAMPROW [rdx] ; inptr2 (Cr samples)
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop: ; one pass converts a ymmword of Y/Cb/Cr samples; even and odd pixels are kept in separate word vectors
+
+ vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0 ; ymm0=all ones
+ vpcmpeqw ymm7, ymm7, ymm7 ; ymm7=all ones
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7 ; ymm2=CbE-128 (0xFF80 is -128 as a signed word)
+ vpaddw ymm3, ymm5, ymm7 ; ymm3=CbO-128
+ vpaddw ymm6, ymm0, ymm7 ; ymm6=CrE-128
+ vpaddw ymm7, ymm1, ymm7 ; ymm7=CrO-128 (the bias constant is consumed here)
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+ ; (from here on Cb/Cr in the comments mean the values with 128 already subtracted)
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [rel PW_ONE] ; +1 so that the >>1 below rounds
+ vpaddw ymm5, ymm5, [rel PW_ONE] ; +1 so that the >>1 below rounds
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [rel PW_ONE] ; +1 so that the >>1 below rounds
+ vpaddw ymm1, ymm1, [rel PW_ONE] ; +1 so that the >>1 below rounds
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2 ; + CbE (first of the two "+ Cb" terms)
+ vpaddw ymm5, ymm5, ymm3 ; + CbO (first of the two "+ Cb" terms)
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6 ; interleave (CbE,CrE) word pairs, high half of each lane
+ vpunpcklwd ymm2, ymm2, ymm6 ; interleave (CbE,CrE) word pairs, low half of each lane
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285] ; dword = CbE*-FIX(0.34414) + CrE*FIX(0.28586)
+ vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285] ; same for the high-half pairs
+ vpunpckhwd ymm5, ymm3, ymm7 ; interleave (CbO,CrO) word pairs, high half of each lane
+ vpunpcklwd ymm3, ymm3, ymm7 ; interleave (CbO,CrO) word pairs, low half of each lane
+ vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285] ; dword = CbO*-FIX(0.34414) + CrO*FIX(0.28586)
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285] ; same for the high-half pairs
+
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF] ; add the rounding term before descaling
+ vpaddd ymm4, ymm4, [rel PD_ONEHALF] ; add the rounding term before descaling
+ vpsrad ymm2, ymm2, SCALEBITS ; descale
+ vpsrad ymm4, ymm4, SCALEBITS ; descale
+ vpaddd ymm3, ymm3, [rel PD_ONEHALF] ; add the rounding term before descaling
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF] ; add the rounding term before descaling
+ vpsrad ymm3, ymm3, SCALEBITS ; descale
+ vpsrad ymm5, ymm5, SCALEBITS ; descale
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4 ; ymm4=all ones
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD ; rcx = pixels still to store in this row
+ jb short .column_st64 ; fewer than a full group: partial store
+
+ test rdi, SIZEOF_YMMWORD-1 ; is outptr ymmword-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE (rcx = bytes left to store)
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF ; remaining bytes now come from the third vector
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD ; remaining bytes now come from the second vector
+ sub rcx, byte SIZEOF_YMMWORD
+ ; fall through to .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; move the upper 128-bit lane down into xmmA
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD ; rcx = pixels still to store in this row
+ jb short .column_st64 ; fewer than a full group: partial store
+
+ test rdi, SIZEOF_YMMWORD-1 ; is outptr ymmword-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2 ; rcx stays in pixels here (4 output bytes each)
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC ; remaining pixels now come from the
+ vmovdqa ymmD, ymmH ; third and fourth vectors
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1 ; move the upper 128-bit lane down into xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx ; restore the values pushed at the top of .rowloop (reverse order)
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next Y row pointer
+ add rbx, byte SIZEOF_JSAMPROW ; next Cb row pointer
+ add rdx, byte SIZEOF_JSAMPROW ; next Cr row pointer
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer (orders the vmovntdq stores above)
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- stack pointer saved at [rbp] on entry (points at the saved rbp)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext-sse2.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; keep the unaligned rsp at the base of the aligned frame
+ mov rbp, rsp ; rbp = aligned frame base, [rbp] = saved rsp
+ lea rsp, [wk(0)] ; reserve WK_NUM xmmwords of scratch below rbp
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return ; zero-width output: nothing to convert
+
+ push rcx ; keep num_cols while rcx holds input_row
+
+ mov rdi, r11 ; input_buf
+ mov ecx, r12d ; input_row
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; input_buf[0]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; input_buf[1]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; input_buf[2]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; &input_buf[0][input_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; &input_buf[1][input_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; &input_buf[2][input_row]
+
+ pop rcx ; rcx = num_cols again
+
+ mov rdi, r13 ; output_buf
+ mov eax, r14d ; num_rows (signed int; the mov zero-extends into rax)
+ test eax, eax ; test the 32-bit value so a negative num_rows is seen as negative
+ jle near .return ; num_rows <= 0: nothing to convert
+.rowloop:
+ push rax ; remaining row count
+ push rdi ; output row-pointer cursor
+ push rdx ; input_buf[2] row-pointer cursor
+ push rbx ; input_buf[1] row-pointer cursor
+ push rsi ; input_buf[0] row-pointer cursor
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0 (Y samples)
+ mov rbxp, JSAMPROW [rbx] ; inptr1 (Cb samples)
+ mov rdxp, JSAMPROW [rdx] ; inptr2 (Cr samples)
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop: ; one pass converts an xmmword of Y/Cb/Cr samples; the movdqa loads need 16-byte-aligned input rows
+
+ movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4 ; xmm4=all ones
+ pcmpeqw xmm7, xmm7 ; xmm7=all ones
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7 ; xmm4=CbE-128 (0xFF80 is -128 as a signed word)
+ paddw xmm5, xmm7 ; xmm5=CbO-128
+ paddw xmm0, xmm7 ; xmm0=CrE-128
+ paddw xmm1, xmm7 ; xmm1=CrO-128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+ ; (from here on Cb/Cr in the comments mean the values with 128 already subtracted)
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [rel PW_ONE] ; rounding term for the >>1 below (PW_ONE comes from the including file; presumably words of 1)
+ paddw xmm5, [rel PW_ONE] ; rounding term for the >>1 below
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [rel PW_ONE] ; rounding term for the >>1 below
+ paddw xmm1, [rel PW_ONE] ; rounding term for the >>1 below
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2 ; + CbE (first of the two "+ Cb" terms)
+ paddw xmm5, xmm3 ; + CbO (first of the two "+ Cb" terms)
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2 ; copy CbE: punpck overwrites its destination
+ movdqa xmm5, xmm3 ; copy CbO: punpck overwrites its destination
+ punpcklwd xmm2, xmm6 ; interleave (CbE,CrE) word pairs, low half
+ punpckhwd xmm4, xmm6 ; interleave (CbE,CrE) word pairs, high half
+ pmaddwd xmm2, [rel PW_MF0344_F0285] ; dword = CbE*-FIX(0.344) + CrE*FIX(0.285)
+ pmaddwd xmm4, [rel PW_MF0344_F0285] ; same for the high-half pairs
+ punpcklwd xmm3, xmm7 ; interleave (CbO,CrO) word pairs, low half
+ punpckhwd xmm5, xmm7 ; interleave (CbO,CrO) word pairs, high half
+ pmaddwd xmm3, [rel PW_MF0344_F0285] ; dword = CbO*-FIX(0.344) + CrO*FIX(0.285)
+ pmaddwd xmm5, [rel PW_MF0344_F0285] ; same for the high-half pairs
+
+ paddd xmm2, [rel PD_ONEHALF] ; rounding term before descaling (PD_ONEHALF comes from the including file)
+ paddd xmm4, [rel PD_ONEHALF] ; rounding term before descaling
+ psrad xmm2, SCALEBITS ; descale
+ psrad xmm4, SCALEBITS ; descale
+ paddd xmm3, [rel PD_ONEHALF] ; rounding term before descaling
+ paddd xmm5, [rel PD_ONEHALF] ; rounding term before descaling
+ psrad xmm3, SCALEBITS ; descale
+ psrad xmm5, SCALEBITS ; descale
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4 ; xmm4=all ones
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD ; rcx = pixels still to store in this row
+ jb short .column_st32 ; fewer than a full group: partial store
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr xmmword-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE (rcx = bytes left to store)
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF ; remaining bytes now come from the third vector
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD ; remaining bytes now come from the second vector
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD ; drop the bytes just stored
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD ; drop the bytes just stored
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA ; eax = the (up to) 3 bytes still pending
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16 ; bring the last pending byte down into al
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD ; rcx = pixels still to store in this row
+ jb short .column_st32 ; fewer than a full group: partial store
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr xmmword-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow ; row finished exactly on a group boundary
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2 ; rcx stays in pixels here (4 output bytes each)
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC ; remaining pixels now come from the
+ movdqa xmmD, xmmH ; third and fourth vectors
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4 ; drop the two pixels just stored
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx ; restore the values pushed at the top of .rowloop (reverse order)
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next Y row pointer
+ add rbx, byte SIZEOF_JSAMPROW ; next Cb row pointer
+ add rdx, byte SIZEOF_JSAMPROW ; next Cr row pointer
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer (orders the movntdq stores above)
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- stack pointer saved at [rbp] on entry (points at the saved rbp)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = round(x * 2^16)
+; Colorspace-conversion coefficients scaled by 2^16 (FIX(1) = 65536, FIX(2) = 131072).
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+; Only F_0_344 and the three derived values are stored below; each fits in a signed 16-bit word.
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant table: 5 entries of 32 bytes (one YMMWORD) each. alignz/GLOBAL_DATA/EXTN are jsimdext.inc macros.
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402 ; 16 words of FIX(0.40200)
+PW_MF0228 times 16 dw -F_0_228 ; 16 words of -FIX(0.22800)
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 ; 8 word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 16 dw 1 ; 16 words of 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) ; 8 dwords of 2^15, i.e. 0.5 at SCALEBITS fraction bits
+; NOTE(review): the table is already 160 bytes (5 x 32); purpose of the trailing alignz is not visible here.
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+; The same body is included 7 times. Pass 1: this file sets no RGB_* override or name alias beforehand.
+%include "jdcolext-avx2.asm"
+; Pass 2 (extrgb): RGB_* -> EXT_RGB_*; jsimd_ycc_rgb_convert_avx2 tokens now read ..._extrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+; Pass 3 (extrgbx): RGB_* -> EXT_RGBX_*; the name alias is simply redefined (it is never %undef'd).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+; Pass 4 (extbgr): RGB_* -> EXT_BGR_*; name alias -> jsimd_ycc_extbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; Pass 5 (extbgrx): RGB_* -> EXT_BGRX_*; name alias -> jsimd_ycc_extbgrx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+; Pass 6 (extxbgr): RGB_* -> EXT_XBGR_*; name alias -> jsimd_ycc_extxbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; Pass 7 (extxrgb): RGB_* -> EXT_XRGB_*; name alias -> jsimd_ycc_extxrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = round(x * 2^16)
+; Colorspace-conversion coefficients scaled by 2^16 (FIX(1) = 65536, FIX(2) = 131072).
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+; Only F_0_344 and the three derived values are stored below; each fits in a signed 16-bit word.
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant table: 5 entries of 16 bytes (one XMMWORD) each. alignz/GLOBAL_DATA/EXTN are jsimdext.inc macros.
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) ; 4 dwords of 2^15, i.e. 0.5 at SCALEBITS fraction bits
+; NOTE(review): the table is 80 bytes (5 x 16); the trailing alignz presumably pads it to 96 -- confirm in jsimdext.inc.
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+; The same body is included 7 times. Pass 1: this file sets no RGB_* override or name alias beforehand.
+%include "jdcolext-sse2.asm"
+; Pass 2 (extrgb): RGB_* -> EXT_RGB_*; jsimd_ycc_rgb_convert_sse2 tokens now read ..._extrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+; Pass 3 (extrgbx): RGB_* -> EXT_RGBX_*; the name alias is simply redefined (it is never %undef'd).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+; Pass 4 (extbgr): RGB_* -> EXT_BGR_*; name alias -> jsimd_ycc_extbgr_convert_sse2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+; Pass 5 (extbgrx): RGB_* -> EXT_BGRX_*; name alias -> jsimd_ycc_extbgrx_convert_sse2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+; Pass 6 (extxbgr): RGB_* -> EXT_XBGR_*; name alias -> jsimd_ycc_extxbgr_convert_sse2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+; Pass 7 (extxrgb): RGB_* -> EXT_XRGB_*; name alias -> jsimd_ycc_extxrgb_convert_sse2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fraction bits; jdmrgext-avx2.asm shifts right by this (vpsrad) after adding PD_ONEHALF
+; YCbCr->RGB coefficients scaled by 2^16 (FIX(1) = 65536, FIX(2) = 131072).
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+; Only F_0_344 and the three derived values are stored below; each fits in a signed 16-bit word.
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant table: 5 entries of 32 bytes (one YMMWORD) each. alignz/GLOBAL_DATA/EXTN are jsimdext.inc macros.
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402 ; 16 words of FIX(0.40200); vpmulhw operand for 2*Cr
+PW_MF0228 times 16 dw -F_0_228 ; 16 words of -FIX(0.22800); vpmulhw operand for 2*Cb
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 ; 8 word pairs; vpmaddwd operand for interleaved (Cb, Cr) words
+PW_ONE times 16 dw 1 ; 16 words of 1; added before the vpsraw-by-1 that halves the vpmulhw results
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) ; 8 dwords of 2^15; added before vpsrad SCALEBITS
+; NOTE(review): the table is already 160 bytes (5 x 32); purpose of the trailing alignz is not visible here.
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+; The same body is included 7 times. Pass 1: this file sets no RGB_* override or name alias beforehand.
+%include "jdmrgext-avx2.asm"
+; Pass 2 (extrgb): RGB_* -> EXT_RGB_*; both h2v1/h2v2 entry-point names are aliased to ..._extrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; Pass 3 (extrgbx): RGB_* -> EXT_RGBX_*; the name aliases are simply redefined (never %undef'd).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; Pass 4 (extbgr): RGB_* -> EXT_BGR_*; name aliases -> ..._extbgr_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; Pass 5 (extbgrx): RGB_* -> EXT_BGRX_*; name aliases -> ..._extbgrx_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; Pass 6 (extxbgr): RGB_* -> EXT_XBGR_*; name aliases -> ..._extxbgr_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; Pass 7 (extxrgb): RGB_* -> EXT_XRGB_*; name aliases -> ..._extxrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; fixed-point fraction bits: FIX(x) = round(x * 2^16)
+; Colorspace-conversion coefficients scaled by 2^16 (FIX(1) = 65536, FIX(2) = 131072).
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942
+; Only F_0_344 and the three derived values are stored below; each fits in a signed 16-bit word.
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant table: 5 entries of 16 bytes (one XMMWORD) each. alignz/GLOBAL_DATA/EXTN are jsimdext.inc macros.
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 word pairs (-FIX(0.34414), FIX(0.28586))
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) ; 4 dwords of 2^15, i.e. 0.5 at SCALEBITS fraction bits
+; NOTE(review): the table is 80 bytes (5 x 16); the trailing alignz presumably pads it to 96 -- confirm in jsimdext.inc.
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+; The same body is included 7 times. Pass 1: this file sets no RGB_* override or name alias beforehand.
+%include "jdmrgext-sse2.asm"
+; Pass 2 (extrgb): RGB_* -> EXT_RGB_*; both h2v1/h2v2 names are aliased to ..._extrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+; Pass 3 (extrgbx): RGB_* -> EXT_RGBX_*; the name aliases are simply redefined (never %undef'd).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+; Pass 4 (extbgr): RGB_* -> EXT_BGR_*; name aliases -> ..._extbgr_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+; Pass 5 (extbgrx): RGB_* -> EXT_BGRX_*; name aliases -> ..._extbgrx_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+; Pass 6 (extxbgr): RGB_* -> EXT_XBGR_*; name aliases -> ..._extxbgr_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+; Pass 7 (extxrgb): RGB_* -> EXT_XRGB_*; name aliases -> ..._extxrgb_...
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [rel PW_ONE]
+ vpaddw ymm4, ymm4, [rel PW_ONE]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [rel PW_ONE]
+ vpaddw ymm0, ymm0, [rel PW_ONE]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
+ vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
+ vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285]
+
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF]
+ vpaddd ymm6, ymm6, [rel PD_ONEHALF]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF]
+ vpaddd ymm7, ymm7, [rel PD_ONEHALF]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Implemented as two calls to jsimd_h2v1_merged_upsample_avx2, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Both calls pass the same chroma tables; only the luma table and output_buf advance.
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp ; NOTE(review): presumably consumed by collect_args (macro not shown here) -- confirm
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d ; eax = output_width
+
+ mov rdi, r11 ; rdi = input_buf
+ mov ecx, r12d ; ecx = in_row_group_ctr
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2]
+ mov rdi, r13 ; rdi = output_buf
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; rsi = &input_buf[0][in_row_group_ctr]
+
+ sub rsp, SIZEOF_JSAMPARRAY*4 ; reserve a temporary pointer table on the stack (3 of 4 slots used)
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp ; rbx = address of the temporary table (passed as input_buf)
+
+ push rdi ; keep output_buf, in_row_group_ctr and output_width across the call
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx ; arg3 = in_row_group_ctr
+ mov r9, rdi ; arg4 = output_buf
+ mov rcx, rax ; arg1 = output_width
+ mov rdx, rbx ; arg2 = temporary table
+ %else
+ mov rdx, rcx ; arg3 = in_row_group_ctr
+ mov rcx, rdi ; arg4 = output_buf
+ mov rdi, rax ; arg1 = output_width
+ mov rsi, rbx ; arg2 = temporary table
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2) ; first output row
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] ; reload the table entries stored before the call
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp ; rbx = address of the temporary table again
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx ; arg3 = in_row_group_ctr
+ mov r9, rdi ; arg4 = output_buf + 1 row pointer
+ mov rcx, rax ; arg1 = output_width
+ mov rdx, rbx ; arg2 = temporary table
+ %else
+ mov rdx, rcx ; arg3 = in_row_group_ctr
+ mov rcx, rdi ; arg4 = output_buf + 1 row pointer
+ mov rdi, rax ; arg1 = output_width
+ mov rsi, rbx ; arg2 = temporary table
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2) ; second output row
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] ; NOTE(review): these three reloads appear unused before the epilogue
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4 ; release the temporary table
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; Each Cb/Cr sample is shared by two horizontally adjacent Y samples.
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Output is RGB_PIXELSIZE (3 or 4) bytes per pixel, selected at assembly time.
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; saved so the epilogue's "pop rsp" can restore the unaligned stack
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve wk[0..WK_NUM-1] just below the aligned rbp
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return ; nothing to do for a zero-width row
+
+ push rcx ; rcx is reused as the row index below
+
+ mov rdi, r11 ; rdi = input_buf
+ mov ecx, r12d ; ecx = in_row_group_ctr
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2]
+ mov rdi, r13 ; rdi = output_buf
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3 ; xmm3=(all 1's)
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3 ; adding 0xFF80 subtracts 128 from each word: CbH-128
+ paddw xmm4, xmm3 ; CbL-128
+ paddw xmm7, xmm3 ; CrH-128
+ paddw xmm0, xmm3 ; CrL-128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [rel PW_ONE] ; bias before the halving shift (assumes PW_ONE is words of 1 -- defined outside this chunk)
+ paddw xmm4, [rel PW_ONE]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [rel PW_ONE]
+ paddw xmm0, [rel PW_ONE]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5 ; + CbH (first of the two "+ Cb" terms)
+ paddw xmm4, xmm2 ; + CbL
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5 ; xmm6=CbH
+ movdqa xmm7, xmm2 ; xmm7=CbL
+ punpcklwd xmm5, xmm1 ; interleave CbH/CrH words (low half) as pmaddwd operand pairs
+ punpckhwd xmm6, xmm1 ; interleave CbH/CrH words (high half)
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+ pmaddwd xmm6, [rel PW_MF0344_F0285]
+ punpcklwd xmm2, xmm3 ; interleave CbL/CrL words (low half)
+ punpckhwd xmm7, xmm3 ; interleave CbL/CrL words (high half)
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm7, [rel PW_MF0344_F0285]
+
+ paddd xmm5, [rel PD_ONEHALF] ; rounding term added before the SCALEBITS shift
+ paddd xmm6, [rel PD_ONEHALF]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm7, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st ; first pass uses the L halves still live in xmm0/xmm2/xmm4
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32 ; fewer than SIZEOF_XMMWORD pixels left: partial store
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD ; col -= pixels just written
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd ; second Y block reuses the H chroma halves saved in wk[]
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD ; from here on rcx counts remaining output bytes
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF ; remaining bytes come from the third vector
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD ; remaining bytes come from the second vector
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD ; drop the bytes just stored
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD ; drop the bytes just stored
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA ; eax = next (up to) 4 pending output bytes
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16 ; drop the 2 bytes just stored
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32 ; fewer than SIZEOF_XMMWORD pixels left: partial store
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD ; col -= pixels just written
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd ; second Y block reuses the H chroma halves saved in wk[]
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2 ; here rcx still counts pixels (4 bytes each)
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC ; shift the remaining two vectors into xmmA/xmmD
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4 ; drop the two pixels just stored
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; Both calls pass the same chroma tables; only the luma table and output_buf advance.
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp ; NOTE(review): presumably consumed by collect_args (macro not shown here) -- confirm
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d ; eax = output_width
+
+ mov rdi, r11 ; rdi = input_buf
+ mov ecx, r12d ; ecx = in_row_group_ctr
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2]
+ mov rdi, r13 ; rdi = output_buf
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; rsi = &input_buf[0][in_row_group_ctr]; the h2v1 routine adds the index once more
+
+ sub rsp, SIZEOF_JSAMPARRAY*4 ; reserve a temporary pointer table on the stack (3 of 4 slots used)
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp ; rbx = address of the temporary table (passed as input_buf)
+
+ push rdi ; keep output_buf, in_row_group_ctr and output_width across the call
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx ; arg3 = in_row_group_ctr
+ mov r9, rdi ; arg4 = output_buf
+ mov rcx, rax ; arg1 = output_width
+ mov rdx, rbx ; arg2 = temporary table
+ %else
+ mov rdx, rcx ; arg3 = in_row_group_ctr
+ mov rcx, rdi ; arg4 = output_buf
+ mov rdi, rax ; arg1 = output_width
+ mov rsi, rbx ; arg2 = temporary table
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2) ; first output row
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] ; reload the table entries stored before the call
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp ; rbx = address of the temporary table again
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx ; arg3 = in_row_group_ctr
+ mov r9, rdi ; arg4 = output_buf + 1 row pointer
+ mov rcx, rax ; arg1 = output_width
+ mov rdx, rbx ; arg2 = temporary table
+ %else
+ mov rdx, rcx ; arg3 = in_row_group_ctr
+ mov rcx, rdi ; arg4 = output_buf + 1 row pointer
+ mov rdi, rax ; arg1 = output_width
+ mov rsi, rbx ; arg2 = temporary table
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2) ; second output row
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY] ; NOTE(review): these three reloads appear unused before the epilogue
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4 ; release the temporary table
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1 ; 16 words of 1 (32 bytes, one YMMWORD operand)
+PW_TWO times 16 dw 2 ; 16 words of 2
+PW_THREE times 16 dw 3 ; 16 words of 3
+PW_SEVEN times 16 dw 7 ; 16 words of 7
+PW_EIGHT times 16 dw 8 ; 16 words of 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+; out[2i] = (3*in[i] + in[i-1] + 1) >> 2; out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; At the row edges the edge sample itself stands in for the missing neighbor.
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp ; NOTE(review): presumably consumed by the push_xmm/collect_args macros (not shown here) -- confirm
+ mov rbp, rsp
+ push_xmm 3
+ collect_args 4
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return ; zero-width rows: nothing to do
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return ; zero rows: nothing to do
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9 ; xmm9=(all 1's)
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
+
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_YMMWORD-1 ; is colctr a whole number of YMMWORD blocks?
+ jz short .skip
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm7=( 0 -- -- ... --): sample 0 acts as its own left neighbor
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD ; round colctr up to a multiple of SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop ; more than one block remains
+
+.columnloop_last:
+ vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm6=(-- ... -- 31): last byte of this block acts as its own right neighbor
+ jmp short .upsample
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; next block
+ vperm2i128 ymm6, ymm0, ymm6, 0x20 ; low lane = 0, high lane = first 16 bytes of next block
+ vpslldq ymm6, ymm6, 15 ; ymm6=(-- ... -- 32): first byte of next block moved to the top byte
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpmullw ymm1, ymm1, [rel PW_THREE] ; 3 * in[i], low 16 samples
+ vpmullw ymm4, ymm4, [rel PW_THREE] ; 3 * in[i], high 16 samples
+ vpaddw ymm2, ymm2, [rel PW_ONE] ; in[i-1] + 1 (rounding bias for even outputs)
+ vpaddw ymm5, ymm5, [rel PW_ONE]
+ vpaddw ymm3, ymm3, [rel PW_TWO] ; in[i+1] + 2 (rounding bias for odd outputs)
+ vpaddw ymm6, ymm6, [rel PW_TWO]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT ; move odd outputs into the high byte of each word
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+ sub rax, byte SIZEOF_YMMWORD ; one input block consumed
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop ; more than one block left
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ push_xmm 3
+ collect_args 4
+ push rbx
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ test rax, SIZEOF_YMMWORD-1
+ jz short .skip
+ push rdx
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1
+ vmovdqa YMMWORD [wk(1)], ymm2
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ ; -- process the last column block
+
+ vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm8, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm8, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1
+ vmovdqa YMMWORD [wk(3)], ymm2
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm0, ymm8, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm8, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm8, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm8, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4
+
+ vpmullw ymm7, ymm7, [rel PW_THREE]
+ vpmullw ymm3, ymm3, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm5, ymm5, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_SEVEN]
+ vpaddw ymm2, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm7, ymm8, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm8, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm8, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm8, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3
+
+ vpmullw ymm6, ymm6, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_EIGHT]
+ vpaddw ymm7, ymm7, [rel PW_SEVEN]
+ vpaddw ymm5, ymm5, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+ sub rax, byte SIZEOF_YMMWORD
+ add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter: each input sample is stored twice in a row.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor (row count; one output row per input row)
+; r11d = JDIMENSION output_width (rounded up to a multiple of 32 below)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax is reloaded before any visible read: only collect_args can use it
+ mov rbp, rsp
+ collect_args 4 ; macro defined outside this view; arguments are read from r10-r13 below
+
+ mov edx, r11d ; rdx = output_width, zero-extended
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD ; rdx = output columns rounded up to a whole YMMWORD
+ jz near .return ; zero width: nothing to write
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz short .return ; zero rows: nothing to write
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi ; keep the row-pointer array positions:
+ push rsi ; rdi/rsi are reused as sample pointers below
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+ mov rax, rdx ; colctr (output columns still to write)
+.columnloop:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .above_16 ; more than one YMMWORD of output (16 input samples) left
+
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; last 16 input samples of the row
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1=( 8 8 9 9 ... 15 15)
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0=( 0 0 1 1 ... 7 7)
+
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow ; this was the final block of the row
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; 32 input samples
+
+ vpermq ymm0, ymm0, 0xd8 ; qwords (0 1 2 3) -> (0 2 1 3) so the in-lane unpacks come out in order
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1=(16 16 17 17 ... 31 31)
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0=( 0 0 1 1 ... 15 15)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD ; two YMMWORDs of output were written
+ jz short .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop ; signed test: stop once rowctr <= 0
+
+.return:
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Plain (box filter) upsampling for 2:1 horizontal and 2:1 vertical:
+; each input sample is doubled in a row and that row is written twice.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor (output rows; two are produced per input row)
+; r11d = JDIMENSION output_width (rounded up to a multiple of 32 below)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push rbp
+ mov rax, rsp ; not read again in this body before being reloaded
+ mov rbp, rsp
+ collect_args 4
+ push rbx ; rbx carries outptr0; restored at .done
+
+ mov edx, r11d ; rdx = output_width, zero-extended
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD ; round up to a whole YMMWORD of output
+ jz near .done ; zero width
+
+ mov rcx, r10 ; rcx = output rows still to produce
+ test rcx, rcx
+ jz near .done ; zero rows
+
+ mov rdip, JSAMPARRAY [r13] ; rdi = output_data (row-pointer array)
+ mov rsi, r12 ; rsi = input_data (row-pointer array)
+.next_row:
+ push rdi ; row-pointer positions survive the block loop
+ push rsi
+
+ mov rax, rdx ; rax = output columns left in this row
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; rbx = upper output row
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; rdi = lower output row
+ mov rsip, JSAMPROW [rsi] ; rsi = input row
+.next_block:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ jbe short .last_half_block ; one YMMWORD of output or less remains
+
+ ; -- full block: 32 input samples -> 64 output samples per row
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8 ; qwords (0 1 2 3) -> (0 2 1 3) ahead of the in-lane unpacks
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1=(16 16 17 17 ... 31 31)
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0=( 0 0 1 1 ... 15 15)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD
+ jz short .row_done ; row finished exactly on a full block
+
+ add rdi, 2*SIZEOF_YMMWORD ; advance lower output row
+ add rbx, 2*SIZEOF_YMMWORD ; advance upper output row
+ add rsi, byte SIZEOF_YMMWORD ; advance input row
+ jmp short .next_block
+
+.last_half_block:
+ ; -- final block: 16 input samples -> 32 output samples per row
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1=( 8 8 9 9 ... 15 15)
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0=( 0 0 1 1 ... 7 7)
+
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ ; the row is complete; fall through
+
+.row_done:
+ pop rsi
+ pop rdi
+
+ add rdi, byte 2*SIZEOF_JSAMPROW ; two output rows consumed
+ add rsi, byte 1*SIZEOF_JSAMPROW ; one input row consumed
+ sub rcx, byte 2
+ jg near .next_row ; signed test on the remaining row count
+
+.done:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1 ; 8 x word: rounding term of the h2v1 even outputs (before >> 2)
+PW_TWO times 8 dw 2 ; 8 x word: rounding term of the h2v1 odd outputs (before >> 2)
+PW_THREE times 8 dw 3 ; 8 x word: weight of the nearer sample in both fancy filters
+PW_SEVEN times 8 dw 7 ; 8 x word: rounding term of the h2v2 odd outputs (before >> 4)
+PW_EIGHT times 8 dw 8 ; 8 x word: rounding term of the h2v2 even outputs (before >> 4)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; out[2i] = (3*in[i] + in[i-1] + 1) >> 2; out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
+
+; r10 = int max_v_samp_factor (row count; one output row per input row)
+; r11d = JDIMENSION downsampled_width (input samples per row)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax is reloaded before any visible read: only collect_args can use it
+ mov rbp, rsp
+ collect_args 4 ; macro from jsimdext.inc; arguments are read from r10-r13 below
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return ; zero width: nothing to do
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return ; zero rows: nothing to do
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rdi ; keep the row-pointer array positions:
+ push rsi ; rdi/rsi are reused as sample pointers below
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip ; width is a whole number of XMMWORD blocks
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (written one past the last input sample)
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7 ; xmm7=(all 1's)
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; only byte 0 stays set
+ pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm7=( 0 -- -- ... -- -- --): sample 0 acts as its own left neighbor
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD ; round colctr up to a whole XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block in this row
+
+.columnloop_last: ; last block: the right neighbor comes from this block itself
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1) ; only byte 15 stays set
+ pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm6=(-- -- -- ... -- -- 15)
+ jmp short .upsample
+
+.columnloop: ; inner block: borrow sample 0 of the next block
+ movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1) ; xmm6=(-- -- -- ... -- -- 16)
+
+.upsample:
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; movdqa: the input row must be 16-byte aligned
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [rel PW_THREE] ; 3 * center sample (low half)
+ pmullw xmm4, [rel PW_THREE] ; 3 * center sample (high half)
+ paddw xmm2, [rel PW_ONE] ; left neighbor + 1
+ paddw xmm5, [rel PW_ONE]
+ paddw xmm3, [rel PW_TWO] ; right neighbor + 2
+ paddw xmm6, [rel PW_TWO]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 ; movdqa: the output row must be 16-byte aligned
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+ sub rax, byte SIZEOF_XMMWORD ; one input block consumed
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; at least two blocks left
+ test eax, eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop ; signed test: stop once rowctr <= 0
+
+.return:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+; Each output = (9*a + 3*b + 3*c + d + bias) >> 4 over a 2x2 input neighborhood.
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+; Reads the row pointers at input_data[-1] and input_data[+1] for every row.
+
+; r10 = int max_v_samp_factor (output rows; two are produced per input row)
+; r11d = JDIMENSION downsampled_width (input samples per row)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4 ; wk(0)/wk(1): left-edge words, wk(2)/wk(3): right-edge words (upper/lower row)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; NOTE(review): only 4, yet 8 bytes are stored below; safe while rsp is 8-byte aligned here - confirm
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash the unaligned rsp at the aligned frame base
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve wk[0..WK_NUM-1] directly below rbp
+ collect_args 4
+ push rbx ; rbx carries inptr0; restored at .return
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return ; zero width: nothing to do
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return ; zero rows: nothing to do
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx ; rowctr (rcx is reused as a row pointer below)
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip ; width is a whole number of XMMWORD blocks
+ push rdx ; dl is used as scratch; keep outptr0
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (one past the end of each of the three rows)
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE] ; 3 * row[0] (low half)
+ pmullw xmm4, [rel PW_THREE] ; 3 * row[0] (high half)
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2) ; xmm7 = mask selecting word 0 only
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 ; (staged in the two output rows)
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1 ; column 0 acts as its own left neighbor (upper row)
+ movdqa XMMWORD [wk(1)], xmm2 ; same for the lower row
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD ; round colctr up to a whole XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block in this row
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1 = mask selecting word 7 only
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] ; from the staged Int0H
+ pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] ; from the staged Int1H
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE] ; 3 * row[0] (low half)
+ pmullw xmm4, [rel PW_THREE] ; 3 * row[0] (high half)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 ; (staged in the next block's output slots)
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1 ; next block's word 0 is the right neighbor (upper row)
+ movdqa XMMWORD [wk(3)], xmm2 ; same for the lower row
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; carry word 15 as the next block's left neighbor
+
+ pmullw xmm7, [rel PW_THREE] ; 3 * Int0 (low half)
+ pmullw xmm3, [rel PW_THREE] ; 3 * Int0 (high half)
+ paddw xmm1, [rel PW_EIGHT] ; left neighbors + 8
+ paddw xmm5, [rel PW_EIGHT]
+ paddw xmm0, [rel PW_SEVEN] ; right neighbors + 7
+ paddw xmm2, [rel PW_SEVEN]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the staged Int0 data
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3 ; carry word 15 as the next block's left neighbor
+
+ pmullw xmm6, [rel PW_THREE] ; 3 * Int1 (low half)
+ pmullw xmm4, [rel PW_THREE] ; 3 * Int1 (high half)
+ paddw xmm1, [rel PW_EIGHT] ; left neighbors + 8
+ paddw xmm0, [rel PW_EIGHT]
+ paddw xmm7, [rel PW_SEVEN] ; right neighbors + 7
+ paddw xmm5, [rel PW_SEVEN]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the staged Int1 data
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+ sub rax, byte SIZEOF_XMMWORD ; one input block consumed
+ add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; at least two blocks left
+ test rax, rax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop ; signed test: stop once rowctr <= 0
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value stashed in the prologue (address of the saved rbp)
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Plain (box filter) upsampling for 2:1 horizontal and 1:1 vertical:
+; each input sample is stored twice, side by side, in the output row.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor (row count; one output row per input row)
+; r11d = JDIMENSION output_width (rounded up to two XMMWORDs below)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push rbp
+ mov rax, rsp ; not read again in this body before being reloaded
+ mov rbp, rsp
+ collect_args 4
+
+ mov edx, r11d ; rdx = output_width, zero-extended
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD) ; round up to a pair of XMMWORDs
+ jz near .done ; zero width
+
+ mov rcx, r10 ; rcx = rows still to produce
+ test rcx, rcx
+ jz short .done ; zero rows
+
+ ; -- set up the two row-pointer arrays
+ mov rdip, JSAMPARRAY [r13] ; rdi = output_data
+ mov rsi, r12 ; rsi = input_data
+.next_row:
+ push rdi ; row-pointer positions survive the block loop
+ push rsi
+
+ mov rax, rdx ; rax = output columns left in this row
+ mov rdip, JSAMPROW [rdi] ; rdi = output row
+ mov rsip, JSAMPROW [rsi] ; rsi = input row
+.next_pair:
+ ; -- first XMMWORD of input in this pass
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpckhbw xmm1, xmm1 ; xmm1=( 8 8 9 9 ... 15 15)
+ punpcklbw xmm0, xmm0 ; xmm0=( 0 0 1 1 ... 7 7)
+
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .row_done ; row ended after the first half
+ ; -- second XMMWORD of input in this pass
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpckhbw xmm3, xmm3 ; xmm3=(24 24 25 25 ... 31 31)
+ punpcklbw xmm2, xmm2 ; xmm2=(16 16 17 17 ... 23 23)
+
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .row_done ; row ended after the second half
+
+ add rdi, byte 4*SIZEOF_XMMWORD ; advance output row
+ add rsi, byte 2*SIZEOF_XMMWORD ; advance input row
+ jmp short .next_pair
+
+.row_done:
+ pop rsi
+ pop rdi
+
+ add rdi, byte SIZEOF_JSAMPROW ; next output row pointer
+ add rsi, byte SIZEOF_JSAMPROW ; next input row pointer
+ dec rcx
+ jg short .next_row ; signed test on the remaining row count
+
+.done:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Plain (box filter) upsampling for 2:1 horizontal and 2:1 vertical:
+; each input sample is doubled in a row and that row is written twice.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor (output rows; two are produced per input row)
+; r11d = JDIMENSION output_width (rounded up to two XMMWORDs below)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push rbp
+ mov rax, rsp ; not read again in this body before being reloaded
+ mov rbp, rsp
+ collect_args 4
+ push rbx ; rbx carries outptr0; restored at .done
+
+ mov edx, r11d ; rdx = output_width, zero-extended
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD) ; round up to a pair of XMMWORDs
+ jz near .done ; zero width
+
+ mov rcx, r10 ; rcx = output rows still to produce
+ test rcx, rcx
+ jz near .done ; zero rows
+
+ ; -- set up the two row-pointer arrays
+ mov rdip, JSAMPARRAY [r13] ; rdi = output_data
+ mov rsi, r12 ; rsi = input_data
+.next_row:
+ push rdi ; row-pointer positions survive the block loop
+ push rsi
+
+ mov rax, rdx ; rax = output columns left in this row
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; rbx = upper output row
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; rdi = lower output row
+ mov rsip, JSAMPROW [rsi] ; rsi = input row
+.next_pair:
+ ; -- first XMMWORD of input in this pass
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpckhbw xmm1, xmm1 ; xmm1=( 8 8 9 9 ... 15 15)
+ punpcklbw xmm0, xmm0 ; xmm0=( 0 0 1 1 ... 7 7)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .row_done ; row ended after the first half
+ ; -- second XMMWORD of input in this pass
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpckhbw xmm3, xmm3 ; xmm3=(24 24 25 25 ... 31 31)
+ punpcklbw xmm2, xmm2 ; xmm2=(16 16 17 17 ... 23 23)
+
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .row_done ; row ended after the second half
+
+ add rdi, byte 4*SIZEOF_XMMWORD ; advance lower output row
+ add rbx, byte 4*SIZEOF_XMMWORD ; advance upper output row
+ add rsi, byte 2*SIZEOF_XMMWORD ; advance input row
+ jmp short .next_pair
+
+.row_done:
+ pop rsi
+ pop rdi
+
+ add rdi, byte 2*SIZEOF_JSAMPROW ; two output rows consumed
+ add rsi, byte 1*SIZEOF_JSAMPROW ; one input row consumed
+ sub rcx, byte 2
+ jg near .next_row ; signed test on the remaining row count
+
+.done:
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44 ; imm 01_00_01_00b: result = %1[0], %1[1], %2[0], %2[1] (low qword of each operand)
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE ; imm 11_10_11_10b: result = %1[2], %1[3], %2[2], %2[3] (high qword of each operand)
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460 ; cos(6*pi/16), replicated into 4 single-precision lanes
+PD_0_707 times 4 dd 0.707106781186547524400844 ; cos(4*pi/16) = 1/sqrt(2)
+PD_0_541 times 4 dd 0.541196100146196984399723 ; cos(2*pi/16) - cos(6*pi/16)
+PD_1_306 times 4 dd 1.306562964876376527856643 ; cos(2*pi/16) + cos(6*pi/16)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash the pre-alignment rsp; reloaded by "pop rsp" in the epilogue
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve WK_NUM aligned xmmwords of scratch just below rbp
+ collect_args 1 ; macro defined outside this chunk; the data pointer is read from r10 below
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4 ; loop counter: rdx advances by 4 rows per iteration
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; advance by 4 rows of DCTSIZE floats
+ dec rcx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4 ; loop counter: rdx advances by 4 columns per iteration
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, byte 4*SIZEOF_FAST_FLOAT ; advance by 4 floats (next group of 4 columns)
+ dec rcx
+ jnz near .columnloop
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value stashed in the prologue (address of the saved rbp)
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433), i.e. the value times 2^CONST_BITS, rounded
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the integers below are the values pre-scaled by 2^30; DESCALE rounds them down to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT ; 8 identical words; pre-shifted so that pmulhw on an operand shifted left by PRE_MULTIPLY_SCALE_BITS gives (x * F) >> CONST_BITS
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT ; same scaling as above
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT ; same scaling as above
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; same scaling as above
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash the pre-alignment rsp; reloaded by "pop rsp" in the epilogue
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve WK_NUM aligned xmmwords of scratch just below rbp
+ collect_args 1 ; macro defined outside this chunk; the data pointer is read from r10 below
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS ; pre-scale so pmulhw keeps CONST_BITS of fraction
+ pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 (data4 saved by pass 1)
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 (data6 saved by pass 1)
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value stashed in the prologue (address of the saved rbp)
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count used by dodct when its pass argument is 1
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS) ; right-shift count used by dodct when its pass argument is 2
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336), i.e. the value times 2^CONST_BITS, rounded
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the integers below are the values pre-scaled by 2^30; DESCALE rounds them down to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2 ; interleave words of rows 0/1 (and 4/5 in the upper lane)
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4 ; interleave words of rows 2/3 (and 6/7 in the upper lane)
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7 ; interleave dwords; results go back into %1-%4
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D ; imm 10_00_11_01b: new qword order = old qwords 1,3,0,2
+ vpermq %2, %2, 0x8D ; same permutation as above
+ vpermq %3, %3, 0xD8 ; imm 11_01_10_00b: new qword order = old qwords 0,2,1,3
+ vpermq %4, %4, 0xD8 ; same permutation as above
+ ; transpose coefficients(phase 3); each register now holds two full transposed rows
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9: Pass (1 or 2)
+
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [rel PW_DESCALE_P2X]
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9] ; token pasting selects PD_DESCALE_P1 or PD_DESCALE_P2 from the pass argument
+ vpaddd %6, %6, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9 ; shift count is DESCALE_P1 or DESCALE_P2
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [rel PW_F050_MF256_F060_MF089] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %9]
+ vpaddd %5, %5, [rel PD_DESCALE_P %+ %9]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; low lane: (tmp13,tmp12) -> data2
+ times 4 dw (F_0_541 - F_1_847), F_0_541 ; high lane: (tmp12,tmp13) -> data6
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; low lane: (z3,z4) -> z3
+ times 4 dw (F_1_175 - F_0_390), F_1_175 ; high lane: (z4,z3) -> z4
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; low lane: (tmp4,tmp7) -> tmp4
+ times 4 dw (F_2_053 - F_2_562), -F_2_562 ; high lane: (tmp5,tmp6) -> tmp5
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 ; low lane: (tmp6,tmp5) -> tmp6
+ times 4 dw (F_1_501 - F_0_899), -F_0_899 ; high lane: (tmp7,tmp4) -> tmp7
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the pass-1 dword shift
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before the pass-2 dword shift
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) ; rounding bias for the pass-2 data0/data4 word shift
+PW_1_NEG1 times 8 dw 1 ; vpsignw operand: low lane kept as is
+ times 8 dw -1 ; vpsignw operand: high lane negated
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push rbp ; save caller's rbp
+ mov rax, rsp ; rax = rsp after the push; not read again in the visible body (collect_args is defined elsewhere)
+ mov rbp, rsp ; rbp = frame pointer
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)] ; rows 0 and 1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)] ; rows 2 and 3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)] ; rows 4 and 5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)] ; rows 6 and 7
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; low lanes  -> rows 0 and 4
+ vperm2i128 ymm1, ymm4, ymm6, 0x31 ; high lanes -> rows 1 and 5
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; low lanes  -> rows 2 and 6
+ vperm2i128 ymm3, ymm5, ymm7, 0x31 ; high lanes -> rows 3 and 7
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3 ; store rows 0 and 1
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5 ; store rows 2 and 3
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6 ; store rows 4 and 5
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7 ; store rows 6 and 7
+
+ vzeroupper ; clear upper YMM halves before returning to code that may use legacy SSE
+ uncollect_args 1
+ pop rbp ; restore caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits of the F_x_xxx fixed-point constants
+%define PASS1_BITS 2 ; extra bits of scale carried from pass 1 into pass 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; right-shift count applied to pmaddwd results in pass 1
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS) ; right-shift count applied to pmaddwd results in pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; (tmp13,tmp12) -> data2
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; (tmp13,tmp12) -> data6
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; (z3,z4) -> z3
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; (z3,z4) -> z4
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; (tmp4,tmp7) -> tmp4
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; (tmp4,tmp7) -> tmp7
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; (tmp5,tmp6) -> tmp5
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; (tmp5,tmp6) -> tmp6
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the pass-1 dword shift
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before the pass-2 dword shift
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) ; rounding bias for the pass-2 data0/data4 word shift
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push rbp ; save caller's rbp
+ mov rax, rsp ; rax = pre-alignment rsp (points at the saved rbp)
+ sub rsp, byte 4 ; step down first so the aligned rsp lands strictly below the saved rbp
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; stash the pre-alignment rsp in the aligned slot
+ mov rbp, rsp ; rbp = aligned frame base ([rbp] holds the stashed rsp)
+ lea rsp, [wk(0)] ; rsp = &wk[0]: reserve WK_NUM xmmwords below rbp
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
+ pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
+ pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm6, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm0, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
+ pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
+ pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm4, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
+ pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [rel PD_DESCALE_P1]
+ paddd xmm3, [rel PD_DESCALE_P1]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [rel PW_DESCALE_P2X]
+ paddw xmm5, [rel PW_DESCALE_P2X]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
+ pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
+ pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm2, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm6, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
+ pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
+ pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
+ pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
+ pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
+ pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm1, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
+ pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
+ pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [rel PD_DESCALE_P2]
+ paddd xmm7, [rel PD_DESCALE_P2]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned frame base (releases the wk[] area)
+ pop rsp ; rsp <- pre-alignment rsp stashed at [rbp] by the prologue
+ pop rbp ; restore caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44 ; keep dwords 0-1 of %1, then append dwords 0-1 of %2
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE ; move dwords 2-3 of %1 down, then append dwords 2-3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689 ; x4 lanes; multiplier used for tmp12 (even part) and tmp11 (odd part)
+PD_1_847 times 4 dd 1.847759065022573512256366 ; x4 lanes; multiplier used for z5 = (z10 + z12) * 1.847759065
+PD_1_082 times 4 dd 1.082392200292393968799446 ; x4 lanes; multiplier applied to z12
+PD_M2_613 times 4 dd -2.613125929752753055713286 ; x4 lanes; multiplier applied to z10 (note the sign)
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3); added in pass 2 before the low word of each dword is extracted
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; x16 bytes; added with paddb to the packed samples in pass 2
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0 ; slot at [aligned rbp] that holds the pre-alignment rsp
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM] (scratch slots directly below the aligned rbp)
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2] (pass-1 output, located below wk[])
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; round rsp down to a 128-bit boundary
+ mov [rsp], rax ; stash the pre-alignment rsp at [original_rbp]
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [workspace] ; reserve wk[] and workspace[] below rbp
+ collect_args 4 ; macro defined outside this file; arguments are then read from r10-r13
+ push rbx ; rbx is used as a second row pointer in pass 2
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+ lea rdi, [workspace] ; FAST_FLOAT *wsptr
+ mov rcx, DCTSIZE/4 ; ctr (each iteration handles 4 columns)
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; quick check on one dword of row 1 ...
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; ... OR'd with one dword of row 2
+ jnz near .columnDCT ; something is nonzero: run the full column IDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5 ; xmm1 = OR of the 64-bit loads from rows 1..7
+ packsswb xmm1, xmm1 ; words -> bytes; signed saturation keeps nonzero words nonzero
+ movd eax, xmm1 ; eax = the four packed bytes
+ test rax, rax
+ jnz short .columnDCT ; at least one AC term in these 4 columns is nonzero
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in0 by row 0 of the table at quantptr
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 ; each broadcast value fills both xmmword halves of its workspace row
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn ; skip the full column IDCT
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in0 by the table at quantptr
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in2
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in4
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in6
+
+ movaps xmm4, xmm0 ; xmm4=in0
+ movaps xmm5, xmm1 ; xmm5=in2
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3 ; xmm1=in2-in6
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414] ; xmm1=(in2-in6)*1.414213562
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4 ; xmm6=tmp10
+ movaps xmm7, xmm0 ; xmm7=tmp11
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in1 by the table at quantptr
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in3
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in5
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; scale in7
+
+ movaps xmm4, xmm2 ; xmm4=in1
+ movaps xmm0, xmm5 ; xmm0=in5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2 ; xmm1=z11
+ subps xmm2, xmm5 ; xmm2=z11-z13
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0 ; xmm3=z10
+ addps xmm0, xmm4 ; xmm0=z10+z12
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6 ; xmm5=tmp0
+ movaps xmm0, xmm7 ; xmm0=tmp1
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7 ; xmm0=tmp2
+ movaps xmm3, xmm5 ; xmm3=tmp3
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 ; store (00 10 20 30)
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 ; store (01 11 21 31)
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 ; store (02 12 22 32)
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 ; store (03 13 23 33)
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 ; store (40 50 60 70)
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 ; store (41 51 61 71)
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 ; store (42 52 62 72)
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 ; store (43 53 63 73)
+
+.nextcolumn:
+ add rsi, byte 4*SIZEOF_JCOEF ; coef_block (advance 4 columns)
+ add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr (advance 4 columns)
+ add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr (advance 4 workspace rows)
+ dec rcx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+ ; (rsi has already advanced by 2 * 4 coefficients in pass 1, hence the "-8")
+
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+ ; The former "mov rax, [original_rbp]" was a dead load: rax is fully
+ ; overwritten by "mov eax, r13d" below before anything reads it.
+
+ lea rsi, [workspace] ; FAST_FLOAT *wsptr
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; rax = output_col (32-bit write zero-extends)
+ mov rcx, DCTSIZE/4 ; ctr (each iteration emits 4 output rows)
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0 ; xmm4=workspace row 0
+ movaps xmm5, xmm1 ; xmm5=workspace row 2
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3 ; xmm1=row2-row6
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414] ; xmm1=(row2-row6)*1.414213562
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4 ; xmm6=tmp10
+ movaps xmm7, xmm0 ; xmm7=tmp11
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2 ; xmm4=workspace row 1
+ movaps xmm0, xmm5 ; xmm0=workspace row 5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2 ; xmm1=z11
+ subps xmm2, xmm5 ; xmm2=z11-z13
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0 ; xmm3=z10
+ addps xmm0, xmm4 ; xmm0=z10+z12
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6 ; xmm5=tmp0
+ movaps xmm0, xmm7 ; xmm0=tmp1
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3 ; xmm3=all ones
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1 ; xmm7=tmp2
+ movaps xmm5, xmm3 ; xmm5=tmp3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4 ; xmm4=all ones
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2 ; add CENTERJSAMPLE to every packed byte
+ paddb xmm1, xmm2 ; add CENTERJSAMPLE to every packed byte
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; row pointer 0 of this group
+ mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] ; row pointer 2 of this group
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 ; store 8 samples at output_col (rax)
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; row pointer 1 of this group
+ mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] ; row pointer 3 of this group
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+ add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add rdi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 row pointers
+ dec rcx ; ctr
+ jnz near .rowloop
+
+ pop rbx ; restore rbx saved in the prologue
+ uncollect_args 4 ; counterpart of collect_args 4 in the prologue
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- pre-alignment rsp stored at [original_rbp]
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2 ; pass 2 descales its results by (PASS1_BITS+3)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1), where FIX(1) = 1 << 8
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw, which keeps the high 16 bits of each product)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT ; x8 words; pmulhw multiplier for tmp12 (even part) and tmp11 (odd part)
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT ; x8 words; pmulhw multiplier for z5
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT ; x8 words; negated 1.613 factor applied to z10 (the remaining -1 * z10 is a separate psubw)
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT ; x8 words; pmulhw multiplier applied to z12
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; x16 bytes; added with paddb to the packed samples in pass 2
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0 ; slot at [aligned rbp] that holds the pre-alignment rsp
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM] (scratch slots directly below the aligned rbp)
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; round rsp down to a 128-bit boundary
+ mov [rsp], rax ; stash the pre-alignment rsp at [original_rbp]
+ mov rbp, rsp ; rbp = aligned frame base
+ lea rsp, [wk(0)] ; reserve wk[] below rbp
+ collect_args 4 ; macro defined outside this file; arguments are then read from r10-r13
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; quick check on one dword of row 1 ...
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; ... OR'd with one dword of row 2
+ jnz near .columnDCT ; something is nonzero: run the full column IDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0 ; xmm1 = OR of rows 1..7
+ packsswb xmm1, xmm1 ; 8 words -> 8 bytes; signed saturation keeps nonzero values nonzero
+ packsswb xmm1, xmm1 ; 8 bytes (read as 4 words) -> 4 bytes, so all fit in one dword
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT ; at least one AC term is nonzero
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ ; Use the IFAST multiplier type here, like every other multiplier
+ ; access in this routine (this line previously named the ISLOW type).
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end ; same register/wk layout as the full column IDCT leaves
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm0=row 0 times its multipliers at quantptr
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm1=row 2 times its multipliers
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm2=row 4 times its multipliers
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm3=row 6 times its multipliers
+
+ movdqa xmm4, xmm0 ; xmm4=row 0
+ movdqa xmm5, xmm1 ; xmm5=row 2
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3 ; xmm1=row2-row6
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS ; pre-shift so pmulhw with a <<CONST_SHIFT constant yields (x*F)>>CONST_BITS
+ pmulhw xmm1, [rel PW_F1414] ; xmm1=(row2-row6)*1.414213562
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4 ; xmm6=tmp10
+ movdqa xmm7, xmm0 ; xmm7=tmp11
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm2=row 1 times its multipliers
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm3=row 3 times its multipliers
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm5=row 5 times its multipliers
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] ; xmm1=row 7 times its multipliers
+
+ movdqa xmm4, xmm2 ; xmm4=row 1
+ movdqa xmm0, xmm5 ; xmm0=row 5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS ; xmm2=z12 (pre-shifted for pmulhw)
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS ; xmm5=z10 (pre-shifted for pmulhw)
+
+ movdqa xmm3, xmm4 ; xmm3=z11
+ psubw xmm4, xmm0 ; xmm4=z11-z13
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5 ; xmm0=z10 (pre-shifted)
+ paddw xmm5, xmm2 ; xmm5=z10+z12 (pre-shifted)
+ pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
+ pmulhw xmm0, [rel PW_MF1613] ; xmm0=-1.613125930*z10
+ pmulhw xmm2, [rel PW_F1082] ; xmm2=1.082392200*z12
+ psubw xmm0, xmm1 ; xmm0=-1.613125930*z10-z10
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6 ; xmm1=tmp0
+ movdqa xmm5, xmm7 ; xmm5=tmp1
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7 ; xmm5=tmp2
+ movdqa xmm0, xmm1 ; xmm0=tmp3
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+ ; (rsi still points at the start of the current block in this routine)
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows held in registers and wk[], store into output array.
+ ; The former "mov rax, [original_rbp]" was a dead load: rax is fully
+ ; overwritten by "mov eax, r13d" below before anything reads it.
+
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; rax = output_col (32-bit write zero-extends)
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6 ; xmm2=col0
+ movdqa xmm0, xmm5 ; xmm0=col2
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3 ; xmm5=col2-col6
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS ; pre-shift so pmulhw with a <<CONST_SHIFT constant yields (x*F)>>CONST_BITS
+ pmulhw xmm5, [rel PW_F1414] ; xmm5=(col2-col6)*1.414213562
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2 ; xmm1=tmp10
+ movdqa xmm3, xmm6 ; xmm3=tmp11
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0 ; xmm2=col1
+ movdqa xmm6, xmm4 ; xmm6=col5
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS ; xmm0=z12 (pre-shifted for pmulhw)
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS ; xmm4=z10 (pre-shifted for pmulhw)
+
+ movdqa xmm5, xmm2 ; xmm5=z11
+ psubw xmm2, xmm6 ; xmm2=z11-z13
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4 ; xmm6=z10 (pre-shifted)
+ paddw xmm4, xmm0 ; xmm4=z10+z12 (pre-shifted)
+ pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
+ pmulhw xmm6, [rel PW_MF1613] ; xmm6=-1.613125930*z10
+ pmulhw xmm0, [rel PW_F1082] ; xmm0=1.082392200*z12
+ psubw xmm6, xmm7 ; xmm6=-1.613125930*z10-z10
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1 ; xmm7=tmp0
+ movdqa xmm4, xmm3 ; xmm4=tmp1
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5 ; xmm4=tmp2
+ movdqa xmm7, xmm6 ; xmm7=tmp3
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2 ; add CENTERJSAMPLE to every packed byte
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; output row pointer 0
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] ; output row pointer 2 (rsi is no longer needed as inptr)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 ; store 8 samples at output_col (rax)
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] ; output row pointer 4
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] ; output row pointer 6
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; output row pointer 1
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] ; output row pointer 3
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] ; output row pointer 5
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] ; output row pointer 7
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+ uncollect_args 4 ; counterpart of collect_args 4 in the prologue
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- pre-alignment rsp stored at [original_rbp]
+ pop rbp ; restore the caller's rbp
+ ret ; single return (an unreachable duplicate ret was removed)
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+; DESCALE_Pn: right-shift count applied, after adding PD_DESCALE_Pn, at the end of pass n.
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+; F_x_xxx = FIX(x): the constant x scaled by 2^CONST_BITS and rounded to an integer.
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is written pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place transpose of an 8x8 block of 16-bit words held two vectors per ymm register (AVX2)
+; %1-%4: Input/output registers (in: data0_1, data3_2, data4_5, data7_6; out: data0_4, data1_5, data2_6, data3_7, as named at the call sites)
+; %5-%8: Temp registers (overwritten)
+
+%macro dotranspose 8
+ ; On entry: %1=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8
+ vpermq %6, %2, 0x72
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1: 64-bit quarters reordered by the vpermq above)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2: words interleaved within each 128-bit lane)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3: second word interleave, results in the temps)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4: 64-bit halves joined; final layout in %1-%4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers (in: in0_4, in3_1, in2_6, in7_5; out: data0_1, data3_2, data4_5, data7_6)
+; %5-%12: Temp registers (overwritten)
+; %13: Pass (1 or 2); selects PD_DESCALE_P1/DESCALE_P1 or PD_DESCALE_P2/DESCALE_P2
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [rel PW_1_NEG1] ; %1=in0_(-in4): high lane negated
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
+
+ vpxor %1, %1, %1 ; %1=0, so the unpacks below put each word in the high half of a dword
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L
+ vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage (each result: add rounding bias, shift right, pack dwords to words)
+
+ vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [rel PD_DESCALE_P %+ %13]
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %13]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [rel PD_DESCALE_P %+ %13]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %13]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %2, %7, %8 ; %2=data3_2
+
+ vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %3, %7, %8 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; low lane: (in2,in6) pairs -> tmp3
+ times 4 dw (F_0_541 - F_1_847), F_0_541 ; high lane: (in6,in2) pairs -> tmp2
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; low lane: (z3,z4) pairs -> z3
+ times 4 dw (F_1_175 - F_0_390), F_1_175 ; high lane: (z4,z3) pairs -> z4
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; low lane: (in7,in1) pairs -> tmp0
+ times 4 dw (F_2_053 - F_2_562), -F_2_562 ; high lane: (in5,in3) pairs -> tmp1
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; low lane: (in7,in1) pairs -> tmp3
+ times 4 dw -F_2_562, (F_3_072 - F_2_562) ; high lane: (in5,in3) pairs -> tmp2
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the pass-1 shift
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) ; rounding bias added before the pass-2 shift
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE ; added to every packed output byte
+PW_1_NEG1 times 8 dw 1 ; vpsignw control: low lane kept as is,
+ times 8 dw -1 ; high lane negated
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ mov rbp, rsp ; rbp = frame pointer (rsp is not realigned in this routine)
+ push_xmm 4
+ collect_args 4
+
+ ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+ mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0 ; xmm1 = OR of coefficient blocks 1..7
+ vpacksswb xmm1, xmm1, xmm1 ; 8 words -> 8 bytes (nonzero stays nonzero)
+ vpacksswb xmm1, xmm1, xmm1 ; 8 bytes -> 4 bytes in the low dword
+ movd eax, xmm1 ; eax != 0 iff any OR-ed word was nonzero
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1 ; ymm4=(00 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07)
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP]
+ vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ vzeroupper ; upper ymm halves are no longer needed; only legacy-SSE movq stores follow
+
+ mov eax, r13d ; rax = output_col (zero-extended)
+
+ mov rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+ mov rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ mov rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ uncollect_args 4
+ pop_xmm 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+; DESCALE_Pn: right-shift count paired with the PD_DESCALE_Pn rounding bias (pass n).
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+; F_x_xxx = FIX(x): the constant x scaled by 2^CONST_BITS and rounded to an integer.
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is written pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fractional bits.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; (z2,z3) pairs -> tmp3
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; (z2,z3) pairs -> tmp2
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; (z3,z4) pairs -> z3
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; (z3,z4) pairs -> z4
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; (tmp0,tmp3) pairs -> tmp0
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; (tmp0,tmp3) pairs -> tmp3
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; (tmp1,tmp2) pairs -> tmp1
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; (tmp1,tmp2) pairs -> tmp2
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; rounding bias added before the DESCALE_P1 shift
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; rounding bias paired with the DESCALE_P2 shift
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; one xmmword of CENTERJSAMPLE bytes
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]: scratch slots laid out just below rbp; wk(0) is the lowest address, wk(WK_NUM-1) ends at rbp
+%define WK_NUM 12
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
+ pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
+ pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
+ pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
+ pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765 ; pmaddwd pair, 4x4 even part: in2*1.847759065 - in6*0.765366865
+PW_F256_F089 times 4 dw F_2_562, F_0_899 ; pmaddwd pair, 4x4 odd tmp2: in1*2.562915447 + in3*0.899976223
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172 ; pmaddwd pair, 4x4 odd tmp0: in1*1.061594337 - in3*2.172734803
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 ; pmaddwd pair, 4x4 odd tmp2: -in5*0.601344887 - in7*0.509795579
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211 ; pmaddwd pair, 4x4 odd tmp0: in5*1.451774981 - in7*0.211164243
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272 ; pmaddwd pair, 2x2 odd tmp0: in1*3.624509785 - in3*1.272758580
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720 ; pmaddwd pair, 2x2 odd tmp0: in5*0.850430095 - in7*0.720959822
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) ; rounding bias added before psrad DESCALE_P1_4 (4x4 pass 1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) ; rounding bias added before psrad DESCALE_P2_4 (4x4 pass 2)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) ; rounding bias added before psrad DESCALE_P1_2 (2x2 pass 1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) ; rounding bias added before psrad DESCALE_P2_2 (2x2 pass 2)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added with paddb to the packed signed output bytes
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]: scratch slots addressed downward from the aligned rbp
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push rbp ; save caller's rbp
+ mov rax, rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax ; keep that address at [original_rbp] for the epilogue
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; rsp = &wk[0], i.e. WK_NUM xmmwords below rbp
+ collect_args 4 ; macro from the includes; presumably fills r10-r13 as listed above (confirm)
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; quick reject: one dword each from rows 1 and 2
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; full test: OR together rows 1,2,3,5,6,7
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; (row 4 is never read by pass 1, so it is not tested)
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0 ; saturating packs keep nonzero values nonzero ...
+ packsswb xmm0, xmm0 ; ... so the low dword now summarizes all 8 OR-ed words
+ movd eax, xmm0
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0 (DC terms)
+
+ psllw xmm0, PASS1_BITS ; scale up to the pass-1 output precision
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end ; same register layout as the end of .columnDCT
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; xmm0=in1
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] ; xmm1=in3
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] ; xmm2=in5
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] ; xmm3=in7
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1 ; xmm4=(in1,in3) word pairs, columns 0-3
+ punpckhwd xmm5, xmm1 ; xmm5=(in1,in3) word pairs, columns 4-7
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3 ; xmm6=(in5,in7) word pairs, columns 0-3
+ punpckhwd xmm7, xmm3 ; xmm7=(in5,in7) word pairs, columns 4-7
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] ; xmm4=in0
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; xmm5=in2
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] ; xmm0=in6
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
+ pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6 ; add rounding bias, then shift: rounded descale
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7 ; add rounding bias, then shift: rounded descale
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rax, [original_rbp] ; NOTE(review): rax is fully overwritten by 'mov eax, r13d' below; this load looks unused
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; rax = output_col (32-bit write zero-extends)
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0 ; xmm1=(01 03 11 13 21 23 31 33), i.e. (col1,col3) pairs
+ punpckhwd xmm6, xmm3 ; xmm6=(05 07 15 17 25 27 35 37), i.e. (col5,col7) pairs
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
+ pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3 ; xmm0=(02 06 12 16 22 26 32 36), i.e. (col2,col6) pairs
+ pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1 ; add rounding bias, then shift: rounded descale
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [rel PB_CENTERJSAMP] ; re-center the packed signed bytes
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; output_buf[0]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; output_buf[1]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 ; row 0: low dword (4 samples) at output_col
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 ; row 1
+ mov rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] ; output_buf[2]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] ; output_buf[3]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 ; row 2
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 ; row 3
+
+ uncollect_args 4 ; counterpart of collect_args (defined in the includes)
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value saved at [original_rbp] (address of the saved rbp)
+ pop rbp ; restore caller's rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push rbp ; save caller's rbp
+ mov rax, rsp ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+ mov rbp, rsp ; plain frame pointer; this routine uses no wk[] scratch
+ collect_args 4 ; macro from the includes; presumably fills r10-r13 as listed above (confirm)
+ push rbx ; rbx is used as a scratch register for the final stores
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7 ; all ones
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [rel PW_F362_MF127] ; in1*F_3_624 - in3*F_1_272 for [col0 col1 **** col3]
+ pmaddwd xmm5, [rel PW_F085_MF072] ; in5*F_0_850 - in7*F_0_720 for [col0 col1 **** col3]
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [rel PW_F362_MF127] ; in1*F_3_624 - in3*F_1_272 for [col1 col3 col5 col7]
+ pmaddwd xmm2, [rel PW_F085_MF072] ; in5*F_0_850 - in7*F_0_720 for [col1 col3 col5 col7]
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2 ; add rounding bias, then shift: rounded descale
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; rax = output_col (32-bit write zero-extends)
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [rel PW_F362_MF127] ; x1*F_3_624 - x3*F_1_272 per row
+ pmaddwd xmm7, [rel PW_F085_MF072] ; x5*F_0_850 - x7*F_0_720 per row
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [rel PD_DESCALE_P2_2] ; add rounding bias, then shift: rounded descale
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [rel PB_CENTERJSAMP] ; re-center the packed signed bytes
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; output_buf[0]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; output_buf[1]
+ mov word [rdx+rax*SIZEOF_JSAMPLE], bx ; row 0 <- (C0 D0) at output_col
+ mov word [rsi+rax*SIZEOF_JSAMPLE], cx ; row 1 <- (C1 D1) at output_col
+
+ pop rbx ; restore rbx saved in the prologue
+ uncollect_args 4 ; counterpart of collect_args (defined in the includes)
+ pop rbp ; restore caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf-sse2.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data (array of row pointers, read two at a time)
+; r11d = JDIMENSION start_col (added to each row pointer as a sample offset)
+; r12 = FAST_FLOAT *workspace (stored to with movaps, which needs 16-byte alignment)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push rbp
+ mov rax, rsp ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+ push rbx ; rbx serves as a row pointer inside the loop
+
+ pcmpeqw xmm7, xmm7 ; all bits set
+ psllw xmm7, 7 ; each word = 0xFF80
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov rsi, r10 ; rsi = cursor into sample_data
+ mov eax, r11d ; rax = start_col (a 32-bit write clears the upper half)
+ mov rdi, r12 ; rdi = cursor into workspace
+ mov rcx, DCTSIZE/2 ; loop count: two rows are converted per pass
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; 8 bytes of the first row, from start_col
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; 8 bytes of the second row, from start_col
+
+ psubb xmm0, xmm7 ; xmm0=(01234567), each byte minus 0x80
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF), each byte minus 0x80
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3); '*' bytes are don't-care
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123), arithmetic shift keeps the sign and drops the '*' bytes
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123) as packed floats
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567) as packed floats
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) as packed floats
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) as packed floats
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; step past the two row pointers just used
+ add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; step past the two workspace rows just written
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block (destination; stored to with movdqa)
+; r11 = FAST_FLOAT *divisors (multiplied into the workspace values)
+; r12 = FAST_FLOAT *workspace (source values; loaded with movaps)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push rbp
+ mov rax, rsp ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+
+ mov rsi, r12 ; rsi = cursor into workspace
+ mov rdx, r11 ; rdx = cursor into divisors
+ mov rdi, r10 ; rdi = cursor into coef_block
+ mov rax, DCTSIZE2/16 ; loop count: 16 values are processed per pass
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] ; workspace * divisors, 4 floats at a time
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0 ; float -> int32, rounded per the current MXCSR rounding mode
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1 ; 8 x int32 -> 8 x int16 with signed saturation
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+ add rsi, byte 16*SIZEOF_FAST_FLOAT ; all three cursors advance by 16 elements
+ add rdx, byte 16*SIZEOF_FAST_FLOAT
+ add rdi, byte 16*SIZEOF_JCOEF
+ dec rax
+ jnz short .quantloop
+
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti-avx2.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data (row pointers 0..7 are all read)
+; r11d = JDIMENSION start_col (added to each row pointer as a sample offset)
+; r12 = DCTELEM *workspace (written with unaligned vmovdqu stores)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+ push rbp
+ mov rax, rsp ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+
+ mov eax, r11d ; rax = start_col (a 32-bit write clears the upper half)
+
+ mov rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] ; low qword = 8 bytes of row 0
+ pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 ; high qword = 8 bytes of row 1
+
+ mov rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] ; rows 2 and 3, packed the same way
+ pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] ; rows 4 and 5
+ pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] ; rows 6 and 7
+ pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vpcmpeqw ymm7, ymm7, ymm7 ; all bits set
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpaddw ymm0, ymm0, ymm7 ; adding 0xFF80 to a word subtracts 128 (mod 2^16)
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm3, ymm3, ymm7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0 ; each store writes 16 words (two rows)
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper ; clear the upper halves of the YMM registers before returning
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+; The macros below address three tables inside divisors, at YMMBLOCK offsets DCTSIZE*0, *1 and *2.
+
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block (destination; written with vmovdqu)
+; r11 = DCTELEM *divisors (base of the reciprocal/correction/scale tables)
+; r12 = DCTELEM *workspace (source coefficients; read with vmovdqu)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+ push rbp
+ mov rax, rsp ; NOTE(review): presumably read by collect_args -- confirm in jsimdext.inc
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+
+ vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] ; ymm4..ymm7 keep the signed inputs for vpsignw
+ vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4 ; ymm0..ymm3 = absolute values of the inputs
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,r11)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,r11)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,r11)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)] ; reciprocal (keeps the high 16 bits of the unsigned product)
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)] ; scale (again the high 16 bits of the product)
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)]
+
+ vpsignw ymm0, ymm0, ymm4 ; give each result the sign of its original input
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0 ; each store writes 16 words
+ vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper ; clear the upper halves of the YMM registers before returning
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti-sse2.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data (array of row pointers, read four at a time)
+; r11d = JDIMENSION start_col (added to each row pointer as a sample offset)
+; r12 = DCTELEM *workspace (stored to with movdqa, which needs 16-byte alignment)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+ push rbp
+ mov rax, rsp ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+ push rbx ; rbx serves as a row pointer inside the loop
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's), used to zero-extend bytes to words
+ pcmpeqw xmm7, xmm7 ; all bits set
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov rsi, r10 ; rsi = cursor into sample_data
+ mov eax, r11d ; rax = start_col (a 32-bit write clears the upper half)
+ mov rdi, r12 ; rdi = cursor into workspace
+ mov rcx, DCTSIZE/4 ; loop count: four rows are converted per pass
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567) as zero-extended words
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) as zero-extended words
+ paddw xmm0, xmm7 ; adding 0xFF80 to a word subtracts 128 (mod 2^16)
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) as zero-extended words
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) as zero-extended words
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 4*SIZEOF_JSAMPROW ; step past the four row pointers just used
+ add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; step past the four workspace rows just written
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+; The macros below address three tables inside divisors, at XMMBLOCK offsets DCTSIZE*0, *1 and *2.
+
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block (destination; stored to with movdqa)
+; r11 = DCTELEM *divisors (base of the reciprocal/correction/scale tables)
+; r12 = DCTELEM *workspace (source coefficients; loaded with movdqa)
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+ push rbp
+ mov rax, rsp ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm
+ mov rbp, rsp
+ collect_args 3 ; macro from jsimdext.inc; presumably places the 3 arguments in r10/r11/r12
+
+ mov rsi, r12 ; rsi = cursor into workspace
+ mov rdx, r11 ; rdx = cursor into divisors
+ mov rdi, r10 ; rdi = cursor into coef_block
+ mov rax, DCTSIZE2/32 ; loop count: 32 coefficients are processed per pass
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4 ; xmm0..xmm3 = working copies of the inputs
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1) ; xmm4..xmm7 = per-word sign masks (presumably 0 or 0xFFFF, if WORD_BIT is 16)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4 ; xor with the mask, then subtract it: negates where the mask is set
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal (keeps the high 16 bits of the unsigned product)
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale (again the high 16 bits of the product)
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+ pxor xmm0, xmm4 ; same xor/subtract pair as above: restores the original sign
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 32*SIZEOF_DCTELEM ; all three cursors advance by 32 elements
+ add rdx, byte 32*SIZEOF_DCTELEM
+ add rdi, byte 32*SIZEOF_JCOEF
+ dec rax
+ jnz near .quantloop ; NOTE(review): `near` presumably because the loop body exceeds short-branch range
+
+ uncollect_args 3 ; counterpart of collect_args (jsimdext.inc)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..3f5ee77eb9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimd.c
@@ -0,0 +1,1110 @@
+/*
+ * jsimd.c (x86-64)
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * PIC builds may not preserve the alignment of constants, so it is checked at
+ * runtime: IS_ALIGNED(ptr, order) is true when ptr is a multiple of 2^order.
+ */
+#define IS_ALIGNED(ptr, order) (((size_t)(ptr) & (((size_t)1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0); /* all-ones until init_simd() stores the detected feature bits */
+static THREAD_LOCAL unsigned int simd_huffman = 1; /* cleared by init_simd() when JSIMD_NOHUFFENC is "1" */
+
+/*
+ * Detect the available SIMD extensions once and cache them in simd_support.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char value[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;                     /* detection has already run */
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* A variable set to exactly "1" overrides the detected capabilities */
+  if (!GETENV_S(value, 2, "JSIMD_FORCESSE2") && strcmp(value, "1") == 0)
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(value, 2, "JSIMD_FORCEAVX2") && strcmp(value, "1") == 0)
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(value, 2, "JSIMD_FORCENONE") && strcmp(value, "1") == 0)
+    simd_support = 0;
+  if (!GETENV_S(value, 2, "JSIMD_NOHUFFENC") && strcmp(value, "1") == 0)
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples, a 4-byte JDIMENSION and 3- or 4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples, a 4-byte JDIMENSION and 3- or 4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples, a 4-byte JDIMENSION and 3- or 4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0; /* unconditionally 0: no SIMD YCC->RGB565 path is reported by this file */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  int use_avx2;
+
+  if (simd_support == ~0U)
+    init_simd();
+  use_avx2 = (simd_support & JSIMD_AVX2) != 0;
+
+  /* Choose the kernel for the input pixel layout; the X and A flavours of a
+     layout share one kernel, and any other color space gets the RGB one. */
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    convert = use_avx2 ? jsimd_extrgb_ycc_convert_avx2 :
+                         jsimd_extrgb_ycc_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = use_avx2 ? jsimd_extrgbx_ycc_convert_avx2 :
+                         jsimd_extrgbx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    convert = use_avx2 ? jsimd_extbgr_ycc_convert_avx2 :
+                         jsimd_extbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = use_avx2 ? jsimd_extbgrx_ycc_convert_avx2 :
+                         jsimd_extbgrx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = use_avx2 ? jsimd_extxbgr_ycc_convert_avx2 :
+                         jsimd_extxbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = use_avx2 ? jsimd_extxrgb_ycc_convert_avx2 :
+                         jsimd_extxrgb_ycc_convert_sse2;
+    break;
+  default:
+    convert = use_avx2 ? jsimd_rgb_ycc_convert_avx2 :
+                         jsimd_rgb_ycc_convert_sse2;
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  int use_avx2;
+
+  if (simd_support == ~0U)
+    init_simd();
+  use_avx2 = (simd_support & JSIMD_AVX2) != 0;
+
+  /* Choose the kernel for the input pixel layout; the X and A flavours of a
+     layout share one kernel, and any other color space gets the RGB one. */
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    convert = use_avx2 ? jsimd_extrgb_gray_convert_avx2 :
+                         jsimd_extrgb_gray_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = use_avx2 ? jsimd_extrgbx_gray_convert_avx2 :
+                         jsimd_extrgbx_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    convert = use_avx2 ? jsimd_extbgr_gray_convert_avx2 :
+                         jsimd_extbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = use_avx2 ? jsimd_extbgrx_gray_convert_avx2 :
+                         jsimd_extbgrx_gray_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = use_avx2 ? jsimd_extxbgr_gray_convert_avx2 :
+                         jsimd_extxbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = use_avx2 ? jsimd_extxrgb_gray_convert_avx2 :
+                         jsimd_extxrgb_gray_convert_sse2;
+    break;
+  default:
+    convert = use_avx2 ? jsimd_rgb_gray_convert_avx2 :
+                         jsimd_rgb_gray_convert_sse2;
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  int use_avx2;
+
+  if (simd_support == ~0U)
+    init_simd();
+  use_avx2 = (simd_support & JSIMD_AVX2) != 0;
+
+  /* Choose the kernel for the output pixel layout; the X and A flavours of a
+     layout share one kernel, and any other color space gets the RGB one. */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    convert = use_avx2 ? jsimd_ycc_extrgb_convert_avx2 :
+                         jsimd_ycc_extrgb_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = use_avx2 ? jsimd_ycc_extrgbx_convert_avx2 :
+                         jsimd_ycc_extrgbx_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    convert = use_avx2 ? jsimd_ycc_extbgr_convert_avx2 :
+                         jsimd_ycc_extbgr_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = use_avx2 ? jsimd_ycc_extbgrx_convert_avx2 :
+                         jsimd_ycc_extbgrx_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = use_avx2 ? jsimd_ycc_extxbgr_convert_avx2 :
+                         jsimd_ycc_extxbgr_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = use_avx2 ? jsimd_ycc_extxrgb_convert_avx2 :
+                         jsimd_ycc_extxrgb_convert_sse2;
+    break;
+  default:
+    convert = use_avx2 ? jsimd_ycc_rgb_convert_avx2 :
+                         jsimd_ycc_rgb_convert_sse2;
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{ /* empty stub: does nothing; jsimd_can_ycc_rgb565() in this file always returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  /* Feature bits for which an accelerated routine is dispatched */
+  const unsigned int usable = JSIMD_AVX2 | JSIMD_SSE2;
+
+  init_simd();
+
+  /*
+   * The routines assume 8-bit samples and a 4-byte JDIMENSION; any other
+   * build configuration is rejected.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* 1 when AVX2 or SSE2 is available, 0 otherwise */
+  return (simd_support & usable) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  /* Feature bits for which an accelerated routine is dispatched */
+  const unsigned int usable = JSIMD_AVX2 | JSIMD_SSE2;
+
+  init_simd();
+
+  /*
+   * The routines assume 8-bit samples and a 4-byte JDIMENSION; any other
+   * build configuration is rejected.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* 1 when AVX2 or SSE2 is available, 0 otherwise */
+  return (simd_support & usable) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  /* Run feature detection first if simd_support is still unset */
+  if (simd_support == ~0U)
+    init_simd();
+
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data, output_data);
+  else
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  /* Run feature detection first if simd_support is still unset */
+  if (simd_support == ~0U)
+    init_simd();
+
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data, output_data);
+  else
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  /* Feature bits for which an accelerated routine is dispatched */
+  const unsigned int usable = JSIMD_AVX2 | JSIMD_SSE2;
+
+  init_simd();
+
+  /*
+   * The routines assume 8-bit samples and a 4-byte JDIMENSION; any other
+   * build configuration is rejected.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* 1 when AVX2 or SSE2 is available, 0 otherwise */
+  return (simd_support & usable) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  /* Feature bits for which an accelerated routine is dispatched */
+  const unsigned int usable = JSIMD_AVX2 | JSIMD_SSE2;
+
+  init_simd();
+
+  /*
+   * The routines assume 8-bit samples and a 4-byte JDIMENSION; any other
+   * build configuration is rejected.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* 1 when AVX2 or SSE2 is available, 0 otherwise */
+  return (simd_support & usable) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples with a 4-byte JDIMENSION are handled */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_fancy_upsample_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_fancy_upsample_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples with a 4-byte JDIMENSION are handled */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_fancy_upsample_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_fancy_upsample_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support == ~0U)
+    init_simd();
+  /* Without the AVX2 bit, the SSE2 routine is used unconditionally */
+  if ((simd_support & JSIMD_AVX2) == 0)
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples with a 4-byte JDIMENSION are handled */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_merged_upsample_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  int avx2_ready, sse2_ready;
+
+  init_simd();
+
+  /* Only 8-bit samples with a 4-byte JDIMENSION are handled */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* A path counts only if its constant table kept the required alignment */
+  avx2_ready = (simd_support & JSIMD_AVX2) != 0 &&
+               IS_ALIGNED_AVX(jconst_merged_upsample_avx2);
+  sse2_ready = (simd_support & JSIMD_SSE2) != 0 &&
+               IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+
+  /* Either path is sufficient */
+  return (avx2_ready || sse2_ready) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  int use_avx2;
+
+  if (simd_support == ~0U)
+    init_simd();
+  use_avx2 = (simd_support & JSIMD_AVX2) != 0;
+
+  /* Choose the kernel for the output pixel layout; the X and A flavours of a
+     layout share one kernel, and any other color space gets the RGB one. */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    upsample = use_avx2 ? jsimd_h2v2_extrgb_merged_upsample_avx2 :
+                          jsimd_h2v2_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = use_avx2 ? jsimd_h2v2_extrgbx_merged_upsample_avx2 :
+                          jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    upsample = use_avx2 ? jsimd_h2v2_extbgr_merged_upsample_avx2 :
+                          jsimd_h2v2_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = use_avx2 ? jsimd_h2v2_extbgrx_merged_upsample_avx2 :
+                          jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = use_avx2 ? jsimd_h2v2_extxbgr_merged_upsample_avx2 :
+                          jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = use_avx2 ? jsimd_h2v2_extxrgb_merged_upsample_avx2 :
+                          jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    upsample = use_avx2 ? jsimd_h2v2_merged_upsample_avx2 :
+                          jsimd_h2v2_merged_upsample_sse2;
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  int use_avx2;
+
+  if (simd_support == ~0U)
+    init_simd();
+  use_avx2 = (simd_support & JSIMD_AVX2) != 0;
+
+  /* Choose the kernel for the output pixel layout; the X and A flavours of a
+     layout share one kernel, and any other color space gets the RGB one. */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    upsample = use_avx2 ? jsimd_h2v1_extrgb_merged_upsample_avx2 :
+                          jsimd_h2v1_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = use_avx2 ? jsimd_h2v1_extrgbx_merged_upsample_avx2 :
+                          jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    upsample = use_avx2 ? jsimd_h2v1_extbgr_merged_upsample_avx2 :
+                          jsimd_h2v1_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = use_avx2 ? jsimd_h2v1_extbgrx_merged_upsample_avx2 :
+                          jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = use_avx2 ? jsimd_h2v1_extxbgr_merged_upsample_avx2 :
+                          jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = use_avx2 ? jsimd_h2v1_extxrgb_merged_upsample_avx2 :
+                          jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    upsample = use_avx2 ? jsimd_h2v1_merged_upsample_avx2 :
+                          jsimd_h2v1_merged_upsample_sse2;
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Report whether a SIMD integer sample-conversion routine may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Either the AVX2 or the SSE2 implementation will do. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2)) ? 1 : 0;
+}
+
+/*
+ * Report whether the SIMD floating-point sample-conversion routine may be
+ * used.  Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * Integer sample conversion: run the AVX2 routine when AVX2 was detected,
+ * otherwise the SSE2 routine.
+ */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  /* Lazily detect CPU features the first time we get here. */
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (!(simd_support & JSIMD_AVX2)) {
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+    return;
+  }
+
+  jsimd_convsamp_avx2(sample_data, start_col, workspace);
+}
+
+/*
+ * Floating-point sample conversion.  Forwards its arguments unchanged to
+ * jsimd_convsamp_float_sse2(); no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_convsamp_float() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+/*
+ * Report whether a SIMD accurate-integer forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* An instruction set only counts if its constant table is aligned. */
+  return (((simd_support & JSIMD_AVX2) &&
+           IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) ||
+          ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fdct_islow_sse2)));
+}
+
+/*
+ * Report whether the SIMD fast-integer forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* SSE2 only counts if its constant table is aligned. */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_ifast_sse2));
+}
+
+/*
+ * Report whether the SIMD floating-point forward DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  /* This kernel needs SSE (not SSE2) and an aligned constant table. */
+  return ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_fdct_float_sse));
+}
+
+/*
+ * Accurate-integer forward DCT: run the AVX2 routine when AVX2 was detected,
+ * otherwise the SSE2 routine.
+ */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  /* Lazily detect CPU features the first time we get here. */
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (!(simd_support & JSIMD_AVX2)) {
+    jsimd_fdct_islow_sse2(data);
+    return;
+  }
+
+  jsimd_fdct_islow_avx2(data);
+}
+
+/*
+ * Fast-integer forward DCT.  Forwards `data` to jsimd_fdct_ifast_sse2();
+ * no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_fdct_ifast() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_sse2(data);
+}
+
+/*
+ * Floating-point forward DCT.  Forwards `data` to jsimd_fdct_float_sse()
+ * (the SSE, not SSE2, routine); no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_fdct_float() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ jsimd_fdct_float_sse(data);
+}
+
+/*
+ * Report whether a SIMD integer quantization routine may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Either the AVX2 or the SSE2 implementation will do. */
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2)) ? 1 : 0;
+}
+
+/*
+ * Report whether the SIMD floating-point quantization routine may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * Integer quantization: run the AVX2 routine when AVX2 was detected,
+ * otherwise the SSE2 routine.
+ */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  /* Lazily detect CPU features the first time we get here. */
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (!(simd_support & JSIMD_AVX2)) {
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+    return;
+  }
+
+  jsimd_quantize_avx2(coef_block, divisors, workspace);
+}
+
+/*
+ * Floating-point quantization.  Forwards its arguments unchanged to
+ * jsimd_quantize_float_sse2(); no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_quantize_float() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+/*
+ * Report whether the SIMD reduced-size (2x2) inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* SSE2 only counts if the reduced-IDCT constant table is aligned. */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2));
+}
+
+/*
+ * Report whether the SIMD reduced-size (4x4) inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* SSE2 only counts if the reduced-IDCT constant table is aligned. */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2));
+}
+
+/*
+ * Reduced-size (2x2) inverse DCT.  Passes compptr->dct_table together with
+ * coef_block, output_buf and output_col to jsimd_idct_2x2_sse2().
+ * `cinfo` is not referenced, and no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_idct_2x2() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * Reduced-size (4x4) inverse DCT.  Passes compptr->dct_table together with
+ * coef_block, output_buf and output_col to jsimd_idct_4x4_sse2().
+ * `cinfo` is not referenced, and no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_idct_4x4() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * Report whether a SIMD accurate-integer inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* An instruction set only counts if its constant table is aligned. */
+  return (((simd_support & JSIMD_AVX2) &&
+           IS_ALIGNED_AVX(jconst_idct_islow_avx2)) ||
+          ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_idct_islow_sse2)));
+}
+
+/*
+ * Report whether the SIMD fast-integer inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+  /* SSE2 only counts if its constant table is aligned. */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_ifast_sse2));
+}
+
+/*
+ * Report whether the SIMD floating-point inverse DCT may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4 ||
+      sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  /* SSE2 only counts if its constant table is aligned. */
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse2));
+}
+
+/*
+ * Accurate-integer inverse DCT: run the AVX2 routine when AVX2 was detected,
+ * otherwise the SSE2 routine.  Only compptr->dct_table is taken from the
+ * component info; `cinfo` is not referenced.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /* Lazily detect CPU features the first time we get here. */
+  if (simd_support == ~0U)
+    init_simd();
+
+  if (!(simd_support & JSIMD_AVX2)) {
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+    return;
+  }
+
+  jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+/*
+ * Fast-integer inverse DCT.  Passes compptr->dct_table together with
+ * coef_block, output_buf and output_col to jsimd_idct_ifast_sse2().
+ * `cinfo` is not referenced, and no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_idct_ifast() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+/*
+ * Floating-point inverse DCT.  Passes compptr->dct_table together with
+ * coef_block, output_buf and output_col to jsimd_idct_float_sse2().
+ * `cinfo` is not referenced, and no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on jsimd_can_idct_float() --
+ * confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+/*
+ * Report whether the SIMD Huffman block encoder may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* Needs SSE2, the simd_huffman switch, and an aligned constant table. */
+  return ((simd_support & JSIMD_SSE2) && simd_huffman &&
+          IS_ALIGNED_SSE(jconst_huff_encode_one_block));
+}
+
+/*
+ * Huffman-encode one block.  Forwards its arguments unchanged to
+ * jsimd_huff_encode_one_block_sse2() and returns that routine's JOCTET
+ * pointer result; no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on
+ * jsimd_can_huff_encode_one_block() -- confirm against the call sites.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+/*
+ * Report whether the SIMD "AC first" preparation routine may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * "AC first" preparation step.  Forwards its arguments unchanged to
+ * jsimd_encode_mcu_AC_first_prepare_sse2(); no run-time CPU feature check is
+ * made here.
+ * NOTE(review): presumably callers gate this on
+ * jsimd_can_encode_mcu_AC_first_prepare() -- confirm against the call sites.
+ */
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+/*
+ * Report whether the SIMD "AC refine" preparation routine may be used.
+ * Returns 1 if so, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  /* The SIMD code was written for exactly this configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_SSE2) ? 1 : 0;
+}
+
+/*
+ * "AC refine" preparation step.  Forwards its arguments unchanged to
+ * jsimd_encode_mcu_AC_refine_prepare_sse2() and returns that routine's int
+ * result; no run-time CPU feature check is made here.
+ * NOTE(review): presumably callers gate this on
+ * jsimd_can_encode_mcu_AC_refine_prepare() -- confirm against the call sites.
+ */
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+; Returns (in rax) a bit mask of JSIMD_* flags:
+;   - JSIMD_SSE and JSIMD_SSE2 are always set.
+;   - JSIMD_AVX2 is added only if CPUID leaf 07H reports AVX2 (EBX bit 5),
+;     CPUID leaf 01H reports OSXSAVE and AVX (ECX bits 27 and 28), and
+;     XGETBV with ECX=0 shows bits 1 and 2 (XMM and YMM state) both set.
+; rbx and rdi are saved on entry and restored before returning.
+; cpuid/xgetbv overwrite rax, rcx and rdx; this routine takes no arguments.
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push rbx ; cpuid overwrites rbx
+ push rdi ; rdi accumulates the result
+
+ xor rdi, rdi ; simd support flag
+
+ ; Assume that all x86-64 processors support SSE & SSE2 instructions
+ or rdi, JSIMD_SSE2
+ or rdi, JSIMD_SSE
+
+ ; Check whether CPUID leaf 07H is supported
+ ; (leaf 07H is used to check for AVX2 instruction support)
+ mov rax, 0 ; leaf 0: rax = maximum basic leaf
+ cpuid
+ cmp rax, 7
+ jl short .return ; Maximum leaf < 07H
+
+ ; Check for AVX2 instruction support
+ mov rax, 7
+ xor rcx, rcx ; sub-leaf 0
+ cpuid
+ mov rax, rbx ; rax = Extended feature flags
+
+ test rax, 1<<5 ; bit5:AVX2
+ jz short .return
+
+ ; Check for AVX2 O/S support (leaf 01H feature flags in rcx)
+ mov rax, 1
+ xor rcx, rcx
+ cpuid
+ test rcx, 1<<27 ; bit27:OSXSAVE
+ jz short .return ; O/S does not support XSAVE
+ test rcx, 1<<28 ; bit28:AVX
+ jz short .return ; CPU does not support AVX
+
+ xor rcx, rcx ; select XCR0
+ xgetbv
+ and rax, 6 ; keep bit1 (XMM) and bit2 (YMM)
+ cmp rax, 6 ; both must be set; otherwise the O/S
+ ; does not manage XMM/YMM state using XSAVE
+ jnz short .return
+
+ or rdi, JSIMD_AVX2
+
+.return:
+ mov rax, rdi ; return the accumulated flags
+
+ pop rdi
+ pop rbx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32