diff options
Diffstat (limited to 'media/libjpeg/simd')
137 files changed, 66043 insertions, 0 deletions
diff --git a/media/libjpeg/simd/arm/jsimd.c b/media/libjpeg/simd/arm/jsimd.c new file mode 100644 index 0000000000..709656ce1b --- /dev/null +++ b/media/libjpeg/simd/arm/jsimd.c @@ -0,0 +1,721 @@ +/* + * jsimd_arm.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2019, Google LLC. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit Arm architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature(char *buffer, char *feature) +{ + char *p; + + if (*feature == 0) + return 0; + if (strncmp(buffer, "Features", 8) != 0) + return 0; + buffer += 8; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "neon")) + simd_support |= JSIMD_NEON; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif +#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__ARM_NEON__) + simd_support |= JSIMD_NEON; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* We still have a chance to use Neon regardless of globally used + * -mcpu/-mfpu options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux/android */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_NEON; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_ycc_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_ycc_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_ycc_convert_neon; + break; + default: + neonfct = jsimd_extrgb_ycc_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_ycc_extrgb_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_ycc_extbgr_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_ycc_extxrgb_convert_neon; + break; + default: + neonfct = jsimd_ycc_extrgb_convert_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/media/libjpeg/simd/arm/jsimd_neon.S b/media/libjpeg/simd/arm/jsimd_neon.S new file mode 100644 index 0000000000..30f9cc63e2 --- /dev/null +++ b/media/libjpeg/simd/arm/jsimd_neon.S @@ -0,0 +1,2878 @@ +/* + * Armv7 Neon optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. + * Copyright (C) 2014, Linaro Limited. All Rights Reserved. + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.arm +.syntax unified + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .private_extern _\fname + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4 x0, x1, x2, x3 + vtrn.16 \x0, \x1 + vtrn.16 \x2, \x3 + vtrn.32 \x0, \x2 + vtrn.32 \x1, \x3 +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define FIX_0_298631336 (2446) +#define FIX_0_390180644 (3196) +#define FIX_0_541196100 (4433) +#define FIX_0_765366865 (6270) +#define FIX_0_899976223 (7373) +#define FIX_1_175875602 (9633) +#define FIX_1_501321110 (12299) +#define FIX_1_847759065 (15137) +#define FIX_1_961570560 (16069) +#define FIX_2_053119869 (16819) +#define FIX_2_562915447 (20995) +#define FIX_3_072711026 (25172) + +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) + +/* + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. + * Uses some ideas from the comments in 'simd/jiss2int-64.asm' + */ +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + JLONG q1, q2, q3, q4, q5, q6, q7; \ + JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((JLONG)row0 - (JLONG)row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((JLONG)row0 + (JLONG)row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ +} + +#define XFIX_0_899976223 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_2_562915447 d0[2] +#define XFIX_0_298631336_MINUS_0_899976223 d0[3] +#define XFIX_1_501321110_MINUS_0_899976223 d1[0] +#define XFIX_2_053119869_MINUS_2_562915447 d1[1] +#define XFIX_0_541196100_PLUS_0_765366865 d1[2] +#define XFIX_1_175875602 d1[3] +#define XFIX_1_175875602_MINUS_0_390180644 d2[0] +#define XFIX_0_541196100_MINUS_1_847759065 d2[1] +#define XFIX_3_072711026_MINUS_2_562915447 d2[2] +#define XFIX_1_175875602_MINUS_1_961570560 d2[3] + +.balign 16 +jsimd_idct_islow_neon_consts: + .short FIX_0_899976223 /* d0[0] */ + .short FIX_0_541196100 /* d0[1] */ + .short FIX_2_562915447 /* d0[2] */ + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ + .short FIX_1_175875602 /* d1[3] */ + /* reloadable constants */ + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ + +asm_function jsimd_idct_islow_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + ROW0L .req d16 + ROW0R .req d17 + ROW1L .req d18 + ROW1R .req d19 + ROW2L .req d20 + ROW2R .req d21 + ROW3L .req d22 + ROW3R .req d23 + ROW4L .req d24 + ROW4R .req d25 + ROW5L .req d26 + ROW5R .req d27 + ROW6L .req d28 + ROW6R .req d29 + ROW7L .req d30 + ROW7R .req d31 + + /* Load and dequantize coefficients into Neon registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_islow_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ + add ip, ip, #16 + vmul.s16 q15, q15, q3 + vpush {d8 - d15} /* save Neon registers */ + /* 1-D IDCT, pass 1, left 4x8 half */ + vadd.s16 d4, ROW7L, ROW3L + vadd.s16 d5, ROW5L, ROW1L + vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d5, XFIX_1_175875602 + vmull.s16 q7, d4, XFIX_1_175875602 + /* Check for the zero coefficients in the right 4x8 half */ + push {r4, r5} + vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW4L + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 + orr r0, r4, r5 + vmov q4, q6 + vmlsl.s16 q6, ROW5L, XFIX_2_562915447 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + orr r0, r0, r4 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + orr r0, r0, r5 + vadd.s32 q1, q3, q2 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] + vmov q5, q7 + vadd.s32 q1, q1, q6 + orr r0, r0, r4 + vmlsl.s16 q7, ROW7L, XFIX_0_899976223 + orr r0, r0, r5 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1L, q1, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 + orr r0, r0, r4 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + orr r0, r0, r5 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] + vmlal.s16 q6, ROW6L, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + orr r0, r0, r4 + vrshrn.s32 ROW6L, q1, #11 + orr r0, r0, r5 + vadd.s32 q1, q3, q5 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW4L + orr r0, r0, r4 + vrshrn.s32 ROW2L, q1, #11 + orr r0, r0, r5 + vrshrn.s32 ROW5L, q3, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 + orr r0, r0, r4 + vadd.s32 q2, q5, q6 + orrs r0, r0, r5 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + orr r0, r4, r5 + vsub.s32 q3, q1, q4 + pop {r4, r5} + vrshrn.s32 ROW7L, q2, #11 + vrshrn.s32 ROW3L, q5, #11 + vrshrn.s32 ROW0L, q6, #11 + vrshrn.s32 ROW4L, q3, #11 + + beq 3f /* Go to do some special handling for the sparse + right 4x8 half */ + + /* 1-D IDCT, pass 1, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vadd.s16 d10, ROW7R, ROW3R + vadd.s16 d8, ROW5R, ROW1R + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vtrn.16 ROW2L, ROW3L + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vtrn.16 ROW0L, ROW1L + vsubl.s16 q3, ROW0R, ROW4R + vmull.s16 q2, ROW2R, XFIX_0_541196100 + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vtrn.16 ROW4L, ROW5L + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 + vtrn.32 ROW1L, ROW3L + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1R, XFIX_0_899976223 + vtrn.32 ROW4L, ROW6L + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vtrn.32 ROW0L, ROW2L + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1R, q1, #11 + vtrn.32 ROW5L, ROW7L + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3R, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vrshrn.s32 ROW6R, q1, #11 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0R, ROW4R + vrshrn.s32 ROW2R, q1, #11 + vrshrn.s32 ROW5R, q3, #11 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vrshrn.s32 ROW7R, q2, #11 + vrshrn.s32 ROW3R, q5, #11 + vrshrn.s32 ROW0R, q6, #11 + vrshrn.s32 ROW4R, q3, #11 + /* Transpose right 4x8 half */ + vtrn.16 ROW6R, ROW7R + vtrn.16 ROW2R, ROW3R + vtrn.16 ROW0R, ROW1R + vtrn.16 ROW4R, ROW5R + vtrn.32 ROW1R, ROW3R + vtrn.32 ROW4R, ROW6R + vtrn.32 ROW0R, ROW2R + vtrn.32 ROW5R, ROW7R + +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ + vmov q4, q6 + vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5R, XFIX_1_175875602 + vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmull.s16 q7, ROW7R, XFIX_1_175875602 + vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + +2: /* Descale to 8-bit and range limit */ + vqrshrn.s16 d16, q8, #2 + vqrshrn.s16 d17, q9, #2 + vqrshrn.s16 d18, q10, #2 + vqrshrn.s16 d19, q11, #2 + vpop {d8 - d15} /* restore Neon registers */ + vqrshrn.s16 d20, q12, #2 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ + vtrn.16 q8, q9 + vqrshrn.s16 d21, q13, #2 + vqrshrn.s16 d22, q14, #2 + vmov.u8 q0, #(CENTERJSAMPLE) + vqrshrn.s16 d23, q15, #2 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vtrn.16 q10, q11 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vadd.u8 q10, q10, q0 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vtrn.8 d22, d23 + vst1.8 {d20}, [TMP1] + vadd.u8 q11, q11, q0 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ + + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vtrn.16 ROW2L, ROW3L + vtrn.16 ROW0L, ROW1L + vtrn.16 ROW4L, ROW5L + vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ + vtrn.32 ROW1L, ROW3L + vtrn.32 ROW4L, ROW6L + vtrn.32 ROW0L, ROW2L + vtrn.32 ROW5L, ROW7L + + cmp r0, #0 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second + pass */ + + /* Only row 0 is non-zero for the right 4x8 half */ + vdup.s16 ROW1R, ROW0R[1] + vdup.s16 ROW2R, ROW0R[2] + vdup.s16 ROW3R, ROW0R[3] + vdup.s16 ROW4R, ROW0R[0] + vdup.s16 ROW5R, ROW0R[1] + vdup.s16 ROW6R, ROW0R[2] + vdup.s16 ROW7R, ROW0R[3] + vdup.s16 ROW0R, ROW0R[0] + b 1b /* Go to 'normal' second pass */ + +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vshll.s16 q3, ROW0L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW0L, #13 + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5L, XFIX_1_175875602 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW7L, XFIX_1_175875602 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW6L, XFIX_0_541196100 + vshll.s16 q3, ROW4L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW4L, #13 + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + b 2b /* Go to epilogue */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + .unreq ROW0L + .unreq ROW0R + .unreq ROW1L + .unreq ROW1R + .unreq ROW2L + .unreq ROW2R + .unreq ROW3L + .unreq ROW3R + .unreq ROW4L + .unreq ROW4R + .unreq ROW5L + .unreq ROW5R + .unreq ROW6L + .unreq ROW6R + .unreq ROW7L + .unreq ROW7R + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in Arm Neon case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. + */ + +#define XFIX_1_082392200 d0[0] +#define XFIX_1_414213562 d0[1] +#define XFIX_1_847759065 d0[2] +#define XFIX_2_613125930 d0[3] + +.balign 16 +jsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + /* Load and dequantize coefficients into Neon registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_ifast_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0}, [ip, :64] /* load constants */ + vmul.s16 q15, q15, q3 + vpush {d8 - d13} /* save Neon registers */ + /* 1-D IDCT, pass 1 */ + vsub.s16 q2, q10, q14 + vadd.s16 q14, q10, q14 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d28, d21 + vswp d26, d19 + /* 1-D IDCT, pass 2 */ + vsub.s16 q2, q10, q14 + vswp d30, d23 + vadd.s16 q14, q10, q14 + vswp d24, d17 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vpop {d8 - d13} /* restore Neon registers */ + vadd.s16 q10, q10, q2 + vsub.s16 q11, q12, q1 + vadd.s16 q12, q12, q1 + /* Descale to 8-bit and range limit */ + vmov.u8 q0, #0x80 + vqshrn.s16 d16, q8, #5 + vqshrn.s16 d17, q9, #5 + vqshrn.s16 d18, q10, #5 + vqshrn.s16 d19, q11, #5 + vqshrn.s16 d20, q12, #5 + vqshrn.s16 d21, q13, #5 + vqshrn.s16 d22, q14, #5 + vqshrn.s16 d23, q15, #5 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vadd.u8 q10, q10, q0 + vadd.u8 q11, q11, q0 + /* Transpose the final 8-bit samples */ + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vtrn.8 d22, d23 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular Neon optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +jsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* d0[0] */ + .short -FIX_0_765366865 /* d0[1] */ + .short -FIX_0_211164243 /* d0[2] */ + .short FIX_1_451774981 /* d0[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* d2[0] */ + .short FIX_2_562915447 /* d2[1] */ + .short 1 << (CONST_BITS + 1) /* d2[2] */ + .short 0 /* d2[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + vmull.s16 q14, \x4, d2[2] + vmlal.s16 q14, \x8, d0[0] + vmlal.s16 q14, \x14, d0[1] + + vmull.s16 q13, \x16, d1[2] + vmlal.s16 q13, \x12, d1[3] + vmlal.s16 q13, \x10, d2[0] + vmlal.s16 q13, \x6, d2[1] + + vmull.s16 q15, \x4, d2[2] + vmlsl.s16 q15, \x8, d0[0] + vmlsl.s16 q15, \x14, d0[1] + + vmull.s16 q12, \x16, d0[2] + vmlal.s16 q12, \x12, d0[3] + vmlal.s16 q12, \x10, d1[0] + vmlal.s16 q12, \x6, d1[1] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y29, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y29, q14, #\shift + .endif + + vadd.s32 q10, q15, q12 + vsub.s32 q15, q15, q12 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q15, q15, #\shift + vmovn.s32 \y27, q10 + vmovn.s32 \y28, q15 + .else + vrshrn.s32 \y27, q10, #\shift + vrshrn.s32 \y28, q15, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + vpush {d8 - d15} + + /* Load constants (d3 is just used for padding) */ + adr TMP4, jsimd_idct_4x4_neon_consts + vld1.16 {d0, d1, d2, d3}, [TMP4, :128] + + /* Load all COEF_BLOCK into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | d14 | d15 + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q3, q3, q10 + vmul.s16 q4, q4, q11 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + vmul.s16 q6, q6, q13 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q7, q7, q14 + vmul.s16 q8, q8, q15 + + /* Pass 1 */ + idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 + transpose_4x4 d5, d7, d9, d11 + + /* Pass 2 */ + idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 + transpose_4x4 d26, d27, d28, d29 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vadd.s16 q14, q14, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q14 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + vst1.32 {d26[0]}, [TMP1]! + vst1.32 {d27[0]}, [TMP3]! + vst1.32 {d26[1]}, [TMP2]! + vst1.32 {d27[1]}, [TMP4]! +#else + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[0]}, [TMP3]! + vst1.8 {d26[1]}, [TMP1]! + vst1.8 {d27[1]}, [TMP3]! + vst1.8 {d26[2]}, [TMP1]! + vst1.8 {d27[2]}, [TMP3]! + vst1.8 {d26[3]}, [TMP1]! + vst1.8 {d27[3]}, [TMP3]! + + vst1.8 {d26[4]}, [TMP2]! + vst1.8 {d27[4]}, [TMP4]! + vst1.8 {d26[5]}, [TMP2]! + vst1.8 {d27[5]}, [TMP4]! + vst1.8 {d26[6]}, [TMP2]! + vst1.8 {d27[6]}, [TMP4]! + vst1.8 {d26[7]}, [TMP2]! + vst1.8 {d27[7]}, [TMP4]! +#endif + + vpop {d8 - d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular Neon optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +jsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* d0[0] */ + .short FIX_0_850430095 /* d0[1] */ + .short -FIX_1_272758580 /* d0[2] */ + .short FIX_3_624509785 /* d0[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + vshll.s16 q14, \x4, #15 + vmull.s16 q13, \x6, d0[3] + vmlal.s16 q13, \x10, d0[2] + vmlal.s16 q13, \x12, d0[1] + vmlal.s16 q13, \x16, d0[0] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y27, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y27, q14, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req ip + + vpush {d8 - d15} + + /* Load constants */ + adr TMP2, jsimd_idct_2x2_neon_consts + vld1.16 {d0}, [TMP2, :64] + + /* Load all COEF_BLOCK into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | - | - + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | - | - + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* Dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vmul.s16 q3, q3, q10 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27}, [DCT_TABLE, :128]! + vmul.s16 q6, q6, q13 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q8, q8, q15 + + /* Pass 1 */ +#if 0 + idct_helper d4, d6, d10, d12, d16, 13, d4, d6 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d11, d13, d17, 13, d5, d7 + transpose_4x4 d5, d7, d9, d11 +#else + vmull.s16 q13, d6, d0[3] + vmlal.s16 q13, d10, d0[2] + vmlal.s16 q13, d12, d0[1] + vmlal.s16 q13, d16, d0[0] + vmull.s16 q12, d7, d0[3] + vmlal.s16 q12, d11, d0[2] + vmlal.s16 q12, d13, d0[1] + vmlal.s16 q12, d17, d0[0] + vshll.s16 q14, d4, #15 + vshll.s16 q15, d5, #15 + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + vrshrn.s32 d4, q10, #13 + vrshrn.s32 d6, q14, #13 + vadd.s32 q10, q15, q12 + vsub.s32 q14, q15, q12 + vrshrn.s32 d5, q10, #13 + vrshrn.s32 d7, q14, #13 + vtrn.16 q2, q3 + vtrn.32 q3, q5 +#endif + + /* Pass 2 */ + idct_helper d4, d6, d10, d7, d11, 20, d26, d27 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q13 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[4]}, [TMP1]! + vst1.8 {d26[1]}, [TMP2]! + vst1.8 {d27[5]}, [TMP2]! + + vpop {d8 - d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + + +.macro do_load size + .if \size == 8 + vld1.8 {d4}, [U, :64]! + vld1.8 {d5}, [V, :64]! + vld1.8 {d0}, [Y, :64]! + pld [U, #64] + pld [V, #64] + pld [Y, #64] + .elseif \size == 4 + vld1.8 {d4[0]}, [U]! + vld1.8 {d4[1]}, [U]! + vld1.8 {d4[2]}, [U]! + vld1.8 {d4[3]}, [U]! + vld1.8 {d5[0]}, [V]! + vld1.8 {d5[1]}, [V]! + vld1.8 {d5[2]}, [V]! + vld1.8 {d5[3]}, [V]! + vld1.8 {d0[0]}, [Y]! + vld1.8 {d0[1]}, [Y]! + vld1.8 {d0[2]}, [Y]! + vld1.8 {d0[3]}, [Y]! + .elseif \size == 2 + vld1.8 {d4[4]}, [U]! + vld1.8 {d4[5]}, [U]! + vld1.8 {d5[4]}, [V]! + vld1.8 {d5[5]}, [V]! + vld1.8 {d0[4]}, [Y]! + vld1.8 {d0[5]}, [Y]! + .elseif \size == 1 + vld1.8 {d4[6]}, [U]! + vld1.8 {d5[6]}, [V]! + vld1.8 {d0[6]}, [Y]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size + .if \bpp == 24 + .if \size == 8 + vst3.8 {d10, d11, d12}, [RGB]! + .elseif \size == 4 + vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vst4.8 {d10, d11, d12, d13}, [RGB]! + .elseif \size == 4 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 16 + .if \size == 8 + vst1.16 {q15}, [RGB]! + .elseif \size == 4 + vst1.16 {d30}, [RGB]! + .elseif \size == 2 + vst1.16 {d31[0]}, [RGB]! + vst1.16 {d31[1]}, [RGB]! + .elseif \size == 1 + vst1.16 {d31[2]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vrshrn.s32 d29, q15, #14 + vaddw.u8 q11, q10, d0 + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + .if \bpp != 16 + vqmovun.s16 d1\g_offs, q11 + vqmovun.s16 d1\r_offs, q12 + vqmovun.s16 d1\b_offs, q14 + .else /* rgb565 */ + vqshlu.s16 q13, q11, #8 + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vsri.u16 q15, q13, #5 + vsri.u16 q15, q14, #11 + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 + /* "do_yuv_to_rgb_stage2" and "store" */ + vrshrn.s32 d20, q10, #15 + /* "load" and "do_yuv_to_rgb_stage1" */ + pld [U, #64] + vrshrn.s32 d21, q11, #15 + pld [V, #64] + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vld1.8 {d4}, [U, :64]! + vrshrn.s32 d28, q14, #14 + vld1.8 {d5}, [V, :64]! + vrshrn.s32 d29, q15, #14 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + vaddw.u8 q11, q10, d0 + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + vqmovun.s16 d1\g_offs, q11 + pld [Y, #64] + vqmovun.s16 d1\r_offs, q12 + vld1.8 {d0}, [Y, :64]! + vqmovun.s16 d1\b_offs, q14 + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + do_store \bpp, 8 + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ + .else /**************************** rgb565 ********************************/ + vqshlu.s16 q13, q11, #8 + pld [Y, #64] + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vld1.8 {d0}, [Y, :64]! + vmull.s16 q11, d7, d1[1] + vmlal.s16 q11, d9, d1[2] + vsri.u16 q15, q13, #5 + vmull.s16 q12, d8, d1[0] + vsri.u16 q15, q14, #11 + vmull.s16 q13, d9, d1[0] + vmull.s16 q14, d6, d1[3] + do_store \bpp, 8 + vmull.s16 q15, d7, d1[3] + .endif +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl, work around that by using adr. + * But this requires a copy of these constants for each function. + */ + +.balign 16 +jsimd_ycc_\colorid\()_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +asm_function jsimd_ycc_\colorid\()_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + INPUT_ROW .req r2 + OUTPUT_BUF .req r3 + NUM_ROWS .req r4 + + INPUT_BUF0 .req r5 + INPUT_BUF1 .req r6 + INPUT_BUF2 .req INPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d1, d2, d3 (d0 is just used for padding) */ + adr ip, jsimd_ycc_\colorid\()_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save Arm registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #4] + ldr INPUT_BUF2, [INPUT_BUF, #8] + .unreq INPUT_BUF + + /* Save Neon registers */ + vpush {d8 - d15} + + /* Initially set d10, d11, d12, d13 to 0xFF */ + vmov.u8 q5, #255 + vmov.u8 q6, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] + ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + blt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load 4 +3: + tst N, #2 + beq 4f + do_load 2 +4: + tst N, #1 + beq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + beq 6f + do_store \bpp, 4 +6: + tst N, #2 + beq 7f + do_store \bpp, 2 +7: + tst N, #1 + beq 8f + do_store \bpp, 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8 - d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + vst1.8 {d20}, [Y]! + vst1.8 {d21}, [U]! + vst1.8 {d22}, [V]! + .elseif \size == 4 + vst1.8 {d20[0]}, [Y]! + vst1.8 {d20[1]}, [Y]! + vst1.8 {d20[2]}, [Y]! + vst1.8 {d20[3]}, [Y]! + vst1.8 {d21[0]}, [U]! + vst1.8 {d21[1]}, [U]! + vst1.8 {d21[2]}, [U]! + vst1.8 {d21[3]}, [U]! + vst1.8 {d22[0]}, [V]! + vst1.8 {d22[1]}, [V]! + vst1.8 {d22[2]}, [V]! + vst1.8 {d22[3]}, [V]! + .elseif \size == 2 + vst1.8 {d20[4]}, [Y]! + vst1.8 {d20[5]}, [Y]! + vst1.8 {d21[4]}, [U]! + vst1.8 {d21[5]}, [U]! + vst1.8 {d22[4]}, [V]! + vst1.8 {d22[5]}, [V]! + .elseif \size == 1 + vst1.8 {d20[6]}, [Y]! + vst1.8 {d21[6]}, [U]! + vst1.8 {d22[6]}, [V]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + vld3.8 {d10, d11, d12}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vld4.8 {d10, d11, d12, d13}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vrev64.32 q9, q1 + vrev64.32 q13, q1 + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.macro do_rgb_to_yuv_stage2 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vshrn.u32 d23, q13, #16 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + vmovn.u16 d20, q10 /* d20 = y */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovn.u16 d22, q12 /* d22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vrev64.32 q9, q1 + vshrn.u32 d23, q13, #16 + vrev64.32 q13, q1 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + do_load \bpp, 8 + vmovn.u16 d20, q10 /* d20 = y */ + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovn.u16 d22, q12 /* d22 = v */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vst1.8 {d20}, [Y]! + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vst1.8 {d21}, [U]! + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vst1.8 {d22}, [V]! + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.balign 16 +jsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + OUTPUT_BUF .req r2 + OUTPUT_ROW .req r3 + NUM_ROWS .req r4 + + OUTPUT_BUF0 .req r5 + OUTPUT_BUF1 .req r6 + OUTPUT_BUF2 .req OUTPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d0, d1, d2, d3 */ + adr ip, jsimd_\colorid\()_ycc_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save Arm registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] + .unreq OUTPUT_BUF + + /* Save Neon registers */ + vpush {d8 - d15} + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + blt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load \bpp, 4 +3: + tst N, #2 + beq 4f + do_load \bpp, 2 +4: + tst N, #1 + beq 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tst N, #4 + beq 6f + do_store 4 +6: + tst N, #2 + beq 7f + do_store 2 +7: + tst N, #1 + beq 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8 - d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req r0 + START_COL .req r1 + WORKSPACE .req r2 + TMP1 .req r3 + TMP2 .req r4 + TMP3 .req r5 + TMP4 .req ip + + push {r4, r5} + vmov.u8 d0, #128 + + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d16}, [TMP1] + vsubl.u8 q8, d16, d0 + vld1.8 {d18}, [TMP2] + vsubl.u8 q9, d18, d0 + vld1.8 {d20}, [TMP3] + vsubl.u8 q10, d20, d0 + vld1.8 {d22}, [TMP4] + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + vsubl.u8 q11, d22, d0 + vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d24}, [TMP1] + vsubl.u8 q12, d24, d0 + vld1.8 {d26}, [TMP2] + vsubl.u8 q13, d26, d0 + vld1.8 {d28}, [TMP3] + vsubl.u8 q14, d28, d0 + vld1.8 {d30}, [TMP4] + vsubl.u8 q15, d30, d0 + vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! + vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! + pop {r4, r5} + bx lr + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define XFIX_0_382683433 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_0_707106781 d0[2] +#define XFIX_1_306562965 d0[3] + +.balign 16 +jsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req r0 + TMP .req ip + + vpush {d8 - d15} + + /* Load constants */ + adr TMP, jsimd_fdct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all DATA into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | q8 + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + vld1.16 {d16, d17, d18, d19}, [DATA, :128]! + vld1.16 {d20, d21, d22, d23}, [DATA, :128]! + vld1.16 {d24, d25, d26, d27}, [DATA, :128]! + vld1.16 {d28, d29, d30, d31}, [DATA, :128] + sub DATA, DATA, #(128 - 32) + + mov TMP, #2 +1: + /* Transpose */ + vtrn.16 q12, q13 + vtrn.16 q10, q11 + vtrn.16 q8, q9 + vtrn.16 q14, q15 + vtrn.32 q9, q11 + vtrn.32 q13, q15 + vtrn.32 q8, q10 + vtrn.32 q12, q14 + vswp d30, d23 + vswp d24, d17 + vswp d26, d19 + /* 1-D FDCT */ + vadd.s16 q2, q11, q12 + vswp d28, d21 + vsub.s16 q12, q11, q12 + vsub.s16 q6, q10, q13 + vadd.s16 q10, q10, q13 + vsub.s16 q7, q9, q14 + vadd.s16 q9, q9, q14 + vsub.s16 q1, q8, q15 + vadd.s16 q8, q8, q15 + vsub.s16 q4, q9, q10 + vsub.s16 q5, q8, q2 + vadd.s16 q3, q9, q10 + vadd.s16 q4, q4, q5 + vadd.s16 q2, q8, q2 + vqdmulh.s16 q4, q4, XFIX_0_707106781 + vadd.s16 q11, q12, q6 + vadd.s16 q8, q2, q3 + vsub.s16 q12, q2, q3 + vadd.s16 q3, q6, q7 + vadd.s16 q7, q7, q1 + vqdmulh.s16 q3, q3, XFIX_0_707106781 + vsub.s16 q6, q11, q7 + vadd.s16 q10, q5, q4 + vqdmulh.s16 q6, q6, XFIX_0_382683433 + vsub.s16 q14, q5, q4 + vqdmulh.s16 q11, q11, XFIX_0_541196100 + vqdmulh.s16 q5, q7, XFIX_1_306562965 + vadd.s16 q4, q1, q3 + vsub.s16 q3, q1, q3 + vadd.s16 q7, q7, q6 + vadd.s16 q11, q11, q6 + vadd.s16 q7, q7, q5 + vadd.s16 q13, q3, q11 + vsub.s16 q11, q3, q11 + vadd.s16 q9, q4, q7 + vsub.s16 q15, q4, q7 + subs TMP, TMP, #1 + bne 1b + + /* store results */ + vst1.16 {d16, d17, d18, d19}, [DATA, :128]! + vst1.16 {d20, d21, d22, d23}, [DATA, :128]! + vst1.16 {d24, d25, d26, d27}, [DATA, :128]! + vst1.16 {d28, d29, d30, d31}, [DATA, :128] + + vpop {d8 - d15} + bx lr + + .unreq DATA + .unreq TMP + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + * Note: the code uses 2 stage pipelining in order to improve instructions + * scheduling and eliminate stalls (this provides ~15% better + * performance for this function on both Arm Cortex-A8 and + * Arm Cortex-A9 when compared to the non-pipelined variant). + * The instructions which belong to the second stage use different + * indentation for better readiability. + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req r0 + DIVISORS .req r1 + WORKSPACE .req r2 + + RECIPROCAL .req DIVISORS + CORRECTION .req r3 + SHIFT .req ip + LOOP_COUNT .req r4 + + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + vabs.s16 q12, q0 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + + push {r4, r5} + mov LOOP_COUNT, #3 +1: + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + veor.u16 q14, q14, q2 /* restore sign */ + vabs.s16 q12, q0 + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + veor.u16 q15, q15, q3 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vsub.u16 q14, q14, q2 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vsub.u16 q15, q15, q3 + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + subs LOOP_COUNT, LOOP_COUNT, #1 + bne 1b + pop {r4, r5} + + veor.u16 q14, q14, q2 /* restore sign */ + veor.u16 q15, q15, q3 + vsub.u16 q14, q14, q2 + vsub.u16 q15, q15, q3 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + + bx lr /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor, + * JDIMENSION downsampled_width, + * JSAMPARRAY input_data, + * JSAMPARRAY *output_data_ptr); + * + * Note: the use of unaligned writes is the main remaining bottleneck in + * this code, which can be potentially solved to get up to tens + * of percents performance improvement on Cortex-A8/Cortex-A9. + */ + +/* + * Upsample 16 source pixels to 32 destination pixels. The new 16 source + * pixels are loaded to q0. The previous 16 source pixels are in q1. The + * shifted-by-one source pixels are constructed in q2 by using q0 and q1. + * Register d28 is used for multiplication by 3. Register q15 is used + * for adding +1 bias. + */ +.macro upsample16 OUTPTR, INPTR + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vmov q1, q0 /* backup source pixels to q1 */ + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! +.endm + +/* + * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16' + * macro, the roles of q0 and q1 registers are reversed for even and odd + * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed. + * Also this unrolling allows to reorder loads and stores to compensate + * multiplication latency and reduce stalls. + */ +.macro upsample32 OUTPTR, INPTR + /* even 16 pixels group */ + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + /* odd 16 pixels group */ + vld1.8 {q1}, [\INPTR]! + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vmovl.u8 q8, d2 + vext.8 q2, q0, q1, #15 + vmovl.u8 q9, d3 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d2, d28 + vmlal.u8 q11, d3, d28 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! +.endm + +/* + * Upsample a row of WIDTH pixels from INPTR to OUTPTR. + */ +.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 + /* special case for the first and last pixels */ + sub \WIDTH, \WIDTH, #1 + add \OUTPTR, \OUTPTR, #1 + ldrb \TMP1, [\INPTR, \WIDTH] + strb \TMP1, [\OUTPTR, \WIDTH, asl #1] + ldrb \TMP1, [\INPTR], #1 + strb \TMP1, [\OUTPTR, #-1] + vmov.8 d3[7], \TMP1 + + subs \WIDTH, \WIDTH, #32 + blt 5f +0: /* process 32 pixels per iteration */ + upsample32 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #32 + bge 0b +5: + adds \WIDTH, \WIDTH, #16 + blt 1f +0: /* process 16 pixels if needed */ + upsample16 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #16 +1: + adds \WIDTH, \WIDTH, #16 + beq 9f + + /* load the remaining 1-15 pixels */ + add \INPTR, \INPTR, \WIDTH + tst \WIDTH, #1 + beq 2f + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #2 + beq 2f + vext.8 d0, d0, d0, #6 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #4 + beq 2f + vrev64.32 d0, d0 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[3]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[2]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #8 + beq 2f + vmov d1, d0 + sub \INPTR, \INPTR, #8 + vld1.8 {d0}, [\INPTR] +2: /* upsample the remaining pixels */ + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vrshrn.u16 d10, q8, #2 + vrshrn.u16 d12, q9, #2 + vshrn.u16 d11, q10, #2 + vshrn.u16 d13, q11, #2 + vzip.8 d10, d11 + vzip.8 d12, d13 + /* store the remaining pixels */ + tst \WIDTH, #8 + beq 2f + vst1.8 {d10, d11}, [\OUTPTR]! + vmov q5, q6 +2: + tst \WIDTH, #4 + beq 2f + vst1.8 {d10}, [\OUTPTR]! + vmov d10, d11 +2: + tst \WIDTH, #2 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! + vst1.8 {d10[2]}, [\OUTPTR]! + vst1.8 {d10[3]}, [\OUTPTR]! + vext.8 d10, d10, d10, #4 +2: + tst \WIDTH, #1 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! +2: +9: +.endm + +asm_function jsimd_h2v1_fancy_upsample_neon + + MAX_V_SAMP_FACTOR .req r0 + DOWNSAMPLED_WIDTH .req r1 + INPUT_DATA .req r2 + OUTPUT_DATA_PTR .req r3 + OUTPUT_DATA .req OUTPUT_DATA_PTR + + OUTPTR .req r4 + INPTR .req r5 + WIDTH .req ip + TMP .req lr + + push {r4, r5, r6, lr} + vpush {d8 - d15} + + ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] + cmp MAX_V_SAMP_FACTOR, #0 + ble 99f + + /* initialize constants */ + vmov.u8 d28, #3 + vmov.u16 q15, #1 +11: + ldr INPTR, [INPUT_DATA], #4 + ldr OUTPTR, [OUTPUT_DATA], #4 + mov WIDTH, DOWNSAMPLED_WIDTH + upsample_row OUTPTR, INPTR, WIDTH, TMP + subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 + bgt 11b + +99: + vpop {d8 - d15} + pop {r4, r5, r6, pc} + + .unreq MAX_V_SAMP_FACTOR + .unreq DOWNSAMPLED_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA_PTR + .unreq OUTPUT_DATA + + .unreq OUTPTR + .unreq INPTR + .unreq WIDTH + .unreq TMP + +.purgem upsample16 +.purgem upsample32 +.purgem upsample_row + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET *) + * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + +.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + sub \PUT_BITS, \PUT_BITS, #0x8 + lsr \TMP, \PUT_BUFFER, \PUT_BITS + uxtb \TMP, \TMP + strb \TMP, [\BUFFER, #1]! + cmp \TMP, #0xff + /*it eq*/ + strbeq \ZERO, [\BUFFER, #1]! +.endm + +.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE + /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ + add \PUT_BITS, \SIZE + /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ + orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE +.endm + +.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + cmp \PUT_BITS, #0x10 + blt 15f + eor \ZERO, \ZERO, \ZERO + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP +15: +.endm + +.balign 16 +jsimd_huff_encode_one_block_neon_consts: + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + +asm_function jsimd_huff_encode_one_block_neon + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + add r7, sp, #0x1c + sub r4, sp, #0x40 + bfc r4, #0, #5 + mov sp, r4 /* align sp on 32 bytes */ + vst1.64 {d8, d9, d10, d11}, [r4, :128]! + vst1.64 {d12, d13, d14, d15}, [r4, :128] + sub sp, #0x140 /* reserve 320 bytes */ + str r0, [sp, #0x18] /* working state > sp + Ox18 */ + add r4, sp, #0x20 /* r4 = t1 */ + ldr lr, [r7, #0x8] /* lr = dctbl */ + sub r10, r1, #0x1 /* r10=buffer-- */ + ldrsh r1, [r2] + mov r9, #0x10 + mov r8, #0x1 + adr r5, jsimd_huff_encode_one_block_neon_consts + /* prepare data */ + vld1.8 {d26}, [r5, :64] + veor q8, q8, q8 + veor q9, q9, q9 + vdup.16 q14, r9 + vdup.16 q15, r8 + veor q10, q10, q10 + veor q11, q11, q11 + sub r1, r1, r3 + add r9, r2, #0x22 + add r8, r2, #0x18 + add r3, r2, #0x36 + vmov.16 d0[0], r1 + vld1.16 {d2[0]}, [r9, :16] + vld1.16 {d4[0]}, [r8, :16] + vld1.16 {d6[0]}, [r3, :16] + add r1, r2, #0x2 + add r9, r2, #0x30 + add r8, r2, #0x26 + add r3, r2, #0x28 + vld1.16 {d0[1]}, [r1, :16] + vld1.16 {d2[1]}, [r9, :16] + vld1.16 {d4[1]}, [r8, :16] + vld1.16 {d6[1]}, [r3, :16] + add r1, r2, #0x10 + add r9, r2, #0x40 + add r8, r2, #0x34 + add r3, r2, #0x1a + vld1.16 {d0[2]}, [r1, :16] + vld1.16 {d2[2]}, [r9, :16] + vld1.16 {d4[2]}, [r8, :16] + vld1.16 {d6[2]}, [r3, :16] + add r1, r2, #0x20 + add r9, r2, #0x32 + add r8, r2, #0x42 + add r3, r2, #0xc + vld1.16 {d0[3]}, [r1, :16] + vld1.16 {d2[3]}, [r9, :16] + vld1.16 {d4[3]}, [r8, :16] + vld1.16 {d6[3]}, [r3, :16] + add r1, r2, #0x12 + add r9, r2, #0x24 + add r8, r2, #0x50 + add r3, r2, #0xe + vld1.16 {d1[0]}, [r1, :16] + vld1.16 {d3[0]}, [r9, :16] + vld1.16 {d5[0]}, [r8, :16] + vld1.16 {d7[0]}, [r3, :16] + add r1, r2, #0x4 + add r9, r2, #0x16 + add r8, r2, #0x60 + add r3, r2, #0x1c + vld1.16 {d1[1]}, [r1, :16] + vld1.16 {d3[1]}, [r9, :16] + vld1.16 {d5[1]}, [r8, :16] + vld1.16 {d7[1]}, [r3, :16] + add r1, r2, #0x6 + add r9, r2, #0x8 + add r8, r2, #0x52 + add r3, r2, #0x2a + vld1.16 {d1[2]}, [r1, :16] + vld1.16 {d3[2]}, [r9, :16] + vld1.16 {d5[2]}, [r8, :16] + vld1.16 {d7[2]}, [r3, :16] + add r1, r2, #0x14 + add r9, r2, #0xa + add r8, r2, #0x44 + add r3, r2, #0x38 + vld1.16 {d1[3]}, [r1, :16] + vld1.16 {d3[3]}, [r9, :16] + vld1.16 {d5[3]}, [r8, :16] + vld1.16 {d7[3]}, [r3, :16] + vcgt.s16 q8, q8, q0 + vcgt.s16 q9, q9, q1 + vcgt.s16 q10, q10, q2 + vcgt.s16 q11, q11, q3 + vabs.s16 q0, q0 + vabs.s16 q1, q1 + vabs.s16 q2, q2 + vabs.s16 q3, q3 + veor q8, q8, q0 + veor q9, q9, q1 + veor q10, q10, q2 + veor q11, q11, q3 + add r9, r4, #0x20 + add r8, r4, #0x80 + add r3, r4, #0xa0 + vclz.i16 q0, q0 + vclz.i16 q1, q1 + vclz.i16 q2, q2 + vclz.i16 q3, q3 + vsub.i16 q0, q14, q0 + vsub.i16 q1, q14, q1 + vsub.i16 q2, q14, q2 + vsub.i16 q3, q14, q3 + vst1.16 {d0, d1, d2, d3}, [r4, :256] + vst1.16 {d4, d5, d6, d7}, [r9, :256] + vshl.s16 q0, q15, q0 + vshl.s16 q1, q15, q1 + vshl.s16 q2, q15, q2 + vshl.s16 q3, q15, q3 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q15 + vsub.i16 q2, q2, q15 + vsub.i16 q3, q3, q15 + vand q8, q8, q0 + vand q9, q9, q1 + vand q10, q10, q2 + vand q11, q11, q3 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + add r1, r2, #0x46 + add r9, r2, #0x3a + add r8, r2, #0x74 + add r3, r2, #0x6a + vld1.16 {d8[0]}, [r1, :16] + vld1.16 {d10[0]}, [r9, :16] + vld1.16 {d12[0]}, [r8, :16] + vld1.16 {d14[0]}, [r3, :16] + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + add r1, r2, #0x54 + add r9, r2, #0x2c + add r8, r2, #0x76 + add r3, r2, #0x78 + vld1.16 {d8[1]}, [r1, :16] + vld1.16 {d10[1]}, [r9, :16] + vld1.16 {d12[1]}, [r8, :16] + vld1.16 {d14[1]}, [r3, :16] + add r1, r2, #0x62 + add r9, r2, #0x1e + add r8, r2, #0x68 + add r3, r2, #0x7a + vld1.16 {d8[2]}, [r1, :16] + vld1.16 {d10[2]}, [r9, :16] + vld1.16 {d12[2]}, [r8, :16] + vld1.16 {d14[2]}, [r3, :16] + add r1, r2, #0x70 + add r9, r2, #0x2e + add r8, r2, #0x5a + add r3, r2, #0x6c + vld1.16 {d8[3]}, [r1, :16] + vld1.16 {d10[3]}, [r9, :16] + vld1.16 {d12[3]}, [r8, :16] + vld1.16 {d14[3]}, [r3, :16] + add r1, r2, #0x72 + add r9, r2, #0x3c + add r8, r2, #0x4c + add r3, r2, #0x5e + vld1.16 {d9[0]}, [r1, :16] + vld1.16 {d11[0]}, [r9, :16] + vld1.16 {d13[0]}, [r8, :16] + vld1.16 {d15[0]}, [r3, :16] + add r1, r2, #0x64 + add r9, r2, #0x4a + add r8, r2, #0x3e + add r3, r2, #0x6e + vld1.16 {d9[1]}, [r1, :16] + vld1.16 {d11[1]}, [r9, :16] + vld1.16 {d13[1]}, [r8, :16] + vld1.16 {d15[1]}, [r3, :16] + add r1, r2, #0x56 + add r9, r2, #0x58 + add r8, r2, #0x4e + add r3, r2, #0x7c + vld1.16 {d9[2]}, [r1, :16] + vld1.16 {d11[2]}, [r9, :16] + vld1.16 {d13[2]}, [r8, :16] + vld1.16 {d15[2]}, [r3, :16] + add r1, r2, #0x48 + add r9, r2, #0x66 + add r8, r2, #0x5c + add r3, r2, #0x7e + vld1.16 {d9[3]}, [r1, :16] + vld1.16 {d11[3]}, [r9, :16] + vld1.16 {d13[3]}, [r8, :16] + vld1.16 {d15[3]}, [r3, :16] + vcgt.s16 q8, q8, q4 + vcgt.s16 q9, q9, q5 + vcgt.s16 q10, q10, q6 + vcgt.s16 q11, q11, q7 + vabs.s16 q4, q4 + vabs.s16 q5, q5 + vabs.s16 q6, q6 + vabs.s16 q7, q7 + veor q8, q8, q4 + veor q9, q9, q5 + veor q10, q10, q6 + veor q11, q11, q7 + add r1, r4, #0x40 + add r9, r4, #0x60 + add r8, r4, #0xc0 + add r3, r4, #0xe0 + vclz.i16 q4, q4 + vclz.i16 q5, q5 + vclz.i16 q6, q6 + vclz.i16 q7, q7 + vsub.i16 q4, q14, q4 + vsub.i16 q5, q14, q5 + vsub.i16 q6, q14, q6 + vsub.i16 q7, q14, q7 + vst1.16 {d8, d9, d10, d11}, [r1, :256] + vst1.16 {d12, d13, d14, d15}, [r9, :256] + vshl.s16 q4, q15, q4 + vshl.s16 q5, q15, q5 + vshl.s16 q6, q15, q6 + vshl.s16 q7, q15, q7 + vsub.i16 q4, q4, q15 + vsub.i16 q5, q5, q15 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vand q8, q8, q4 + vand q9, q9, q5 + vand q10, q10, q6 + vand q11, q11, q7 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + ldr r12, [r7, #0xc] /* r12 = actbl */ + add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ + mov r9, r12 /* r9 = actbl */ + add r6, r4, #0x80 /* r6 = t2 */ + ldr r11, [r0, #0x8] /* r11 = put_buffer */ + ldr r4, [r0, #0xc] /* r4 = put_bits */ + ldrh r2, [r6, #-128] /* r2 = nbits */ + ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */ + ldr r0, [lr, r2, lsl #2] + ldrb r5, [r1, r2] + put_bits r11, r4, r0, r5 + checkbuf15 r10, r11, r4, r5, r0 + put_bits r11, r4, r3, r2 + checkbuf15 r10, r11, r4, r5, r0 + mov lr, r6 /* lr = t2 */ + add r5, r9, #0x400 /* r5 = actbl->ehufsi */ + ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ + veor q8, q8, q8 + vceq.i16 q0, q0, q8 + vceq.i16 q1, q1, q8 + vceq.i16 q2, q2, q8 + vceq.i16 q3, q3, q8 + vceq.i16 q4, q4, q8 + vceq.i16 q5, q5, q8 + vceq.i16 q6, q6, q8 + vceq.i16 q7, q7, q8 + vmovn.i16 d0, q0 + vmovn.i16 d2, q1 + vmovn.i16 d4, q2 + vmovn.i16 d6, q3 + vmovn.i16 d8, q4 + vmovn.i16 d10, q5 + vmovn.i16 d12, q6 + vmovn.i16 d14, q7 + vand d0, d0, d26 + vand d2, d2, d26 + vand d4, d4, d26 + vand d6, d6, d26 + vand d8, d8, d26 + vand d10, d10, d26 + vand d12, d12, d26 + vand d14, d14, d26 + vpadd.i8 d0, d0, d2 + vpadd.i8 d4, d4, d6 + vpadd.i8 d8, d8, d10 + vpadd.i8 d12, d12, d14 + vpadd.i8 d0, d0, d4 + vpadd.i8 d8, d8, d12 + vpadd.i8 d0, d0, d8 + vmov.32 r1, d0[1] + vmov.32 r8, d0[0] + mvn r1, r1 + mvn r8, r8 + lsrs r1, r1, #0x1 + rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ + rbit r1, r1 /* r1 = index1 */ + rbit r8, r8 /* r8 = index0 */ + ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ + str r1, [sp, #0x14] /* index1 > sp + 0x14 */ + cmp r8, #0x0 + beq 6f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r12, sp, #0x20 /* r12 = t1 */ + ldr r8, [sp, #0x14] /* r8 = index1 */ + adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ + cmp r8, #0x0 + beq 6f + clz r2, r8 + sub r12, r12, lr + lsl r8, r8, r2 + add r2, r2, r12, lsr #1 + add lr, lr, r2, lsl #1 + b 7f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 +7: + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r0, sp, #0x20 + add r0, #0xfe + cmp lr, r0 + bhs 1f + ldr r1, [r9] + ldrb r0, [r5] + put_bits r11, r4, r1, r0 + checkbuf15 r10, r11, r4, r0, r1 +1: + ldr r12, [sp, #0x18] + str r11, [r12, #0x8] + str r4, [r12, #0xc] + add r0, r10, #0x1 + add r4, sp, #0x140 + vld1.64 {d8, d9, d10, d11}, [r4, :128]! + vld1.64 {d12, d13, d14, d15}, [r4, :128] + sub r4, r7, #0x1c + mov sp, r4 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf15 diff --git a/media/libjpeg/simd/arm64/jsimd.c b/media/libjpeg/simd/arm64/jsimd.c new file mode 100644 index 0000000000..808c0e3e27 --- /dev/null +++ b/media/libjpeg/simd/arm64/jsimd.c @@ -0,0 +1,798 @@ +/* + * jsimd_arm64.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit Arm architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +#define JSIMD_FASTLD3 1 +#define JSIMD_FASTST3 2 +#define JSIMD_FASTTBL 4 + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; +static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | + JSIMD_FASTTBL; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_cpuinfo(char *buffer, const char *field, char *value) +{ + char *p; + + if (*value == 0) + return 0; + if (strncmp(buffer, field, strlen(field)) != 0) + return 0; + buffer += strlen(field); + while (isspace(*buffer)) + buffer++; + + /* Check if 'value' is present in the buffer as a separate word */ + while ((p = strstr(buffer, value))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(value); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_cpuinfo(buffer, "CPU part", "0xd03") || + check_cpuinfo(buffer, "CPU part", "0xd07")) + /* The Cortex-A53 has a slow tbl implementation. We can gain a few + percent speedup by disabling the use of that instruction. The + speedup on Cortex-A57 is more subtle but still measurable. */ + simd_features &= ~JSIMD_FASTTBL; + else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) + /* The SIMD version of Huffman encoding is slower than the C version on + Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that + CPU. */ + simd_huffman = simd_features = 0; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ + +/* + * Armv8 architectures support Neon extensions by default. + * It is no longer optional as it was with Armv7. + */ + + +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + + simd_support |= JSIMD_NEON; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_NEON; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; + env = getenv("JSIMD_FASTLD3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTLD3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTLD3; + env = getenv("JSIMD_FASTST3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTST3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTST3; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTLD3) + neonfct = jsimd_extrgb_ycc_convert_neon; + else + neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTLD3) + neonfct = jsimd_extbgr_ycc_convert_neon; + else + neonfct = jsimd_extbgr_ycc_convert_neon_slowld3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_ycc_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTLD3) + neonfct = jsimd_extrgb_ycc_convert_neon; + else + neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTST3) + neonfct = jsimd_ycc_extrgb_convert_neon; + else + neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTST3) + neonfct = jsimd_ycc_extbgr_convert_neon; + else + neonfct = jsimd_ycc_extbgr_convert_neon_slowst3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_ycc_extxrgb_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTST3) + neonfct = jsimd_ycc_extrgb_convert_neon; + else + neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_neon(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + if (simd_features & JSIMD_FASTTBL) + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); + else + return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, + last_dc_val, dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/media/libjpeg/simd/arm64/jsimd_neon.S b/media/libjpeg/simd/arm64/jsimd_neon.S new file mode 100644 index 0000000000..c13d0d37c0 --- /dev/null +++ b/media/libjpeg/simd/arm64/jsimd_neon.S @@ -0,0 +1,3435 @@ +/* + * Armv8 Neon optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> + * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +#if defined(__APPLE__) +.section __DATA, __const +#elif defined(_WIN32) +.section .rdata +#else +.section .rodata, "a", %progbits +#endif + +/* Constants for jsimd_idct_islow_neon() */ + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_idct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +/* Constants for jsimd_idct_ifast_neon() */ + +.balign 16 +Ljsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +Ljsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* v0.h[0] */ + .short -FIX_0_765366865 /* v0.h[1] */ + .short -FIX_0_211164243 /* v0.h[2] */ + .short FIX_1_451774981 /* v0.h[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* v2.h[0] */ + .short FIX_2_562915447 /* v2.h[1] */ + .short 1 << (CONST_BITS + 1) /* v2.h[2] */ + .short 0 /* v2.h[3] */ + +.balign 8 +Ljsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* v14[0] */ + .short FIX_0_850430095 /* v14[1] */ + .short -FIX_1_272758580 /* v14[2] */ + .short FIX_3_624509785 /* v14[3] */ + +/* Constants for jsimd_ycc_*_neon() */ + +.balign 16 +Ljsimd_ycc_rgb_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +/* Constants for jsimd_*_ycc_neon() */ + +.balign 16 +Ljsimd_rgb_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +/* Constants for jsimd_fdct_islow_neon() */ + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_fdct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +/* Constants for jsimd_fdct_ifast_neon() */ + +.balign 16 +Ljsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +/* Constants for jsimd_h2*_downsample_neon() */ + +.balign 16 +Ljsimd_h2_downsample_neon_consts: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ + .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ + .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ + +/* Constants for jsimd_huff_encode_one_block_neon() */ + +.balign 16 +Ljsimd_huff_encode_one_block_neon_consts: + .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + .byte 0, 1, 2, 3, 16, 17, 32, 33, \ + 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ + .byte 34, 35, 48, 49, 255, 255, 50, 51, \ + 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ + .byte 8, 9, 22, 23, 36, 37, 50, 51, \ + 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ + .byte 54, 55, 40, 41, 26, 27, 12, 13, \ + 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ + .byte 6, 7, 20, 21, 34, 35, 48, 49, \ + 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ + .byte 42, 43, 28, 29, 14, 15, 30, 31, \ + 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ + .byte 255, 255, 255, 255, 56, 57, 42, 43, \ + 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ + .byte 26, 27, 40, 41, 42, 43, 28, 29, \ + 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ + .byte 255, 255, 255, 255, 0, 1, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ + .byte 4, 5, 6, 7, 255, 255, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ + +.text + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .private_extern _\fname + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Get symbol location */ +.macro get_symbol_loc reg, symbol +#ifdef __APPLE__ + adrp \reg, \symbol@PAGE + add \reg, \reg, \symbol@PAGEOFF +#else + adrp \reg, \symbol + add \reg, \reg, :lo12:\symbol +#endif +.endm + +/* Transpose elements of single 128 bit registers */ +.macro transpose_single x0, x1, xi, xilen, literal + ins \xi\xilen[0], \x0\xilen[0] + ins \x1\xilen[0], \x0\xilen[1] + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose elements of 2 different registers */ +.macro transpose x0, x1, xi, xilen, literal + mov \xi\xilen, \x0\xilen + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x2\x2len + trn2 \x2\x2len, \xi\x0len, \x2\x2len + mov \xi\xilen, \x1\xilen + trn1 \x1\x1len, \x1\x1len, \x3\x3len + trn2 \x3\x3len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x1\x1len + trn2 \x1\x2len, \xi\x0len, \x1\x2len + mov \xi\xilen, \x2\xilen + trn1 \x2\x2len, \x2\x2len, \x3\x3len + trn2 \x3\x2len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4 x0, x1, x2, x3, x5 + transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b + transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b +.endm + +.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 + trn1 \t0\().8h, \l0\().8h, \l1\().8h + trn1 \t1\().8h, \l2\().8h, \l3\().8h + trn1 \t2\().8h, \l4\().8h, \l5\().8h + trn1 \t3\().8h, \l6\().8h, \l7\().8h + trn2 \l1\().8h, \l0\().8h, \l1\().8h + trn2 \l3\().8h, \l2\().8h, \l3\().8h + trn2 \l5\().8h, \l4\().8h, \l5\().8h + trn2 \l7\().8h, \l6\().8h, \l7\().8h + + trn1 \l4\().4s, \t2\().4s, \t3\().4s + trn2 \t3\().4s, \t2\().4s, \t3\().4s + trn1 \t2\().4s, \t0\().4s, \t1\().4s + trn2 \l2\().4s, \t0\().4s, \t1\().4s + trn1 \t0\().4s, \l1\().4s, \l3\().4s + trn2 \l3\().4s, \l1\().4s, \l3\().4s + trn2 \t1\().4s, \l5\().4s, \l7\().4s + trn1 \l5\().4s, \l5\().4s, \l7\().4s + + trn2 \l6\().2d, \l2\().2d, \t3\().2d + trn1 \l0\().2d, \t2\().2d, \l4\().2d + trn1 \l1\().2d, \t0\().2d, \l5\().2d + trn2 \l7\().2d, \l3\().2d, \t1\().2d + trn1 \l2\().2d, \l2\().2d, \t3\().2d + trn2 \l4\().2d, \t2\().2d, \l4\().2d + trn1 \l3\().2d, \l3\().2d, \t1\().2d + trn2 \l5\().2d, \t0\().2d, \l5\().2d +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_idct_islow_neon + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + sub sp, sp, #64 + get_symbol_loc x15, Ljsimd_idct_islow_neon_consts + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x15] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 + ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 + ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 + + cmeq v16.8h, v3.8h, #0 + cmeq v26.8h, v4.8h, #0 + cmeq v27.8h, v5.8h, #0 + cmeq v28.8h, v6.8h, #0 + cmeq v29.8h, v7.8h, #0 + cmeq v30.8h, v8.8h, #0 + cmeq v31.8h, v9.8h, #0 + + and v10.16b, v16.16b, v26.16b + and v11.16b, v27.16b, v28.16b + and v12.16b, v29.16b, v30.16b + and v13.16b, v31.16b, v10.16b + and v14.16b, v11.16b, v12.16b + mul v2.8h, v2.8h, v18.8h + and v15.16b, v13.16b, v14.16b + shl v10.8h, v2.8h, #(PASS1_BITS) + sqxtn v16.8b, v15.8h + mov TMP1, v16.d[0] + mvn TMP2, TMP1 + + cbnz TMP2, 2f + /* case all AC coeffs are zeros */ + dup v2.2d, v10.d[0] + dup v6.2d, v10.d[1] + mov v3.16b, v2.16b + mov v7.16b, v6.16b + mov v4.16b, v2.16b + mov v8.16b, v6.16b + mov v5.16b, v2.16b + mov v9.16b, v6.16b +1: + /* for this transpose, we should organise data like this: + * 00, 01, 02, 03, 40, 41, 42, 43 + * 10, 11, 12, 13, 50, 51, 52, 53 + * 20, 21, 22, 23, 60, 61, 62, 63 + * 30, 31, 32, 33, 70, 71, 72, 73 + * 04, 05, 06, 07, 44, 45, 46, 47 + * 14, 15, 16, 17, 54, 55, 56, 57 + * 24, 25, 26, 27, 64, 65, 66, 67 + * 34, 35, 36, 37, 74, 75, 76, 77 + */ + trn1 v28.8h, v2.8h, v3.8h + trn1 v29.8h, v4.8h, v5.8h + trn1 v30.8h, v6.8h, v7.8h + trn1 v31.8h, v8.8h, v9.8h + trn2 v16.8h, v2.8h, v3.8h + trn2 v17.8h, v4.8h, v5.8h + trn2 v18.8h, v6.8h, v7.8h + trn2 v19.8h, v8.8h, v9.8h + trn1 v2.4s, v28.4s, v29.4s + trn1 v6.4s, v30.4s, v31.4s + trn1 v3.4s, v16.4s, v17.4s + trn1 v7.4s, v18.4s, v19.4s + trn2 v4.4s, v28.4s, v29.4s + trn2 v8.4s, v30.4s, v31.4s + trn2 v5.4s, v16.4s, v17.4s + trn2 v9.4s, v18.4s, v19.4s + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + movi v0.16b, #(CENTERJSAMPLE) + /* Prepare pointers (dual-issue with Neon instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP1, TMP1, OUTPUT_COL + sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP2, TMP2, OUTPUT_COL + sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP3, TMP3, OUTPUT_COL + sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP4, TMP4, OUTPUT_COL + sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 + blr x30 + +.balign 16 +2: + mul v3.8h, v3.8h, v19.8h + mul v4.8h, v4.8h, v20.8h + mul v5.8h, v5.8h, v21.8h + add TMP4, xzr, TMP2, LSL #32 + mul v6.8h, v6.8h, v22.8h + mul v7.8h, v7.8h, v23.8h + adds TMP3, xzr, TMP2, LSR #32 + mul v8.8h, v8.8h, v24.8h + mul v9.8h, v9.8h, v25.8h + b.ne 3f + /* Right AC coef is zero */ + dup v15.2d, v10.d[1] + /* Even part: reverse the even part of the forward DCT. */ + add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + mov v6.16b, v15.16b + mov v7.16b, v15.16b + mov v8.16b, v15.16b + mov v9.16b, v15.16b + b 1b + +.balign 16 +3: + cbnz TMP4, 4f + /* Left AC coef is zero */ + dup v14.2d, v10.d[0] + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + mov v2.16b, v14.16b + mov v3.16b, v14.16b + mov v4.16b, v14.16b + mov v5.16b, v14.16b + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + +.balign 16 +4: + /* "No" AC coef is zero */ + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + +#undef CENTERJSAMPLE +#undef CONST_BITS +#undef PASS1_BITS +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in Arm Neon case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. + */ + +#define XFIX_1_082392200 v0.h[0] +#define XFIX_1_414213562 v0.h[1] +#define XFIX_1_847759065 v0.h[2] +#define XFIX_2_613125930 v0.h[3] + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* Load and dequantize coefficients into Neon registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( v16.8h ) + * 1 | d18 | d19 ( v17.8h ) + * 2 | d20 | d21 ( v18.8h ) + * 3 | d22 | d23 ( v19.8h ) + * 4 | d24 | d25 ( v20.8h ) + * 5 | d26 | d27 ( v21.8h ) + * 6 | d28 | d29 ( v22.8h ) + * 7 | d30 | d31 ( v23.8h ) + */ + /* Save Neon registers used in fast IDCT */ + get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts + ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 + mul v16.8h, v16.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v17.8h, v17.8h, v1.8h + ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 + mul v18.8h, v18.8h, v2.8h + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + mul v19.8h, v19.8h, v3.8h + ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 + mul v20.8h, v20.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v22.8h, v22.8h, v2.8h + mul v21.8h, v21.8h, v1.8h + ld1 {v0.4h}, [TMP5] /* load constants */ + mul v23.8h, v23.8h, v3.8h + + /* 1-D IDCT, pass 1 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 + /* 1-D IDCT, pass 2 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + /* Descale to 8-bit and range limit */ + movi v0.16b, #0x80 + /* Prepare pointers (dual-issue with Neon instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqshrn v28.8b, v16.8h, #5 + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqshrn v29.8b, v17.8h, #5 + add TMP1, TMP1, OUTPUT_COL + sqshrn v30.8b, v18.8h, #5 + add TMP2, TMP2, OUTPUT_COL + sqshrn v31.8b, v19.8h, #5 + add TMP3, TMP3, OUTPUT_COL + sqshrn2 v28.16b, v20.8h, #5 + add TMP4, TMP4, OUTPUT_COL + sqshrn2 v29.16b, v21.8h, #5 + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqshrn2 v30.16b, v22.8h, #5 + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqshrn2 v31.16b, v23.8h, #5 + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular Neon optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + smull v28.4s, \x4, v2.h[2] + smlal v28.4s, \x8, v0.h[0] + smlal v28.4s, \x14, v0.h[1] + + smull v26.4s, \x16, v1.h[2] + smlal v26.4s, \x12, v1.h[3] + smlal v26.4s, \x10, v2.h[0] + smlal v26.4s, \x6, v2.h[1] + + smull v30.4s, \x4, v2.h[2] + smlsl v30.4s, \x8, v0.h[0] + smlsl v30.4s, \x14, v0.h[1] + + smull v24.4s, \x16, v0.h[2] + smlal v24.4s, \x12, v0.h[3] + smlal v24.4s, \x10, v1.h[0] + smlal v24.4s, \x6, v1.h[1] + + add v20.4s, v28.4s, v26.4s + sub v28.4s, v28.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v28.4s, v28.4s, #\shift + xtn \y26, v20.4s + xtn \y29, v28.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y29, v28.4s, #\shift + .endif + + add v20.4s, v30.4s, v24.4s + sub v30.4s, v30.4s, v24.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v30.4s, v30.4s, #\shift + xtn \y27, v20.4s + xtn \y28, v30.4s + .else + rshrn \y27, v20.4s, #\shift + rshrn \y28, v30.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x2 + TMP4 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* Save all used Neon registers */ + sub sp, sp, 64 + mov x9, sp + /* Load constants (v3.4h is just used for padding) */ + get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] + + /* Load all COEF_BLOCK into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | v8.4h | v9.4h + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | v14.4h | v15.4h + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] /* 128 bit q4 */ + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] /* 128 bit q6 */ + mul v8.4h, v8.4h, v22.4h + mul v9.4h, v9.4h, v23.4h + ins v8.d[1], v9.d[0] /* 128 bit q8 */ + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] /* 128 bit q10 */ + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] /* 128 bit q12 */ + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v14.4h, v14.4h, v28.4h + mul v15.4h, v15.4h, v29.4h + ins v14.d[1], v15.d[0] /* 128 bit q14 */ + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] /* 128 bit q16 */ + + /* Pass 1 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ + v4.4h, v6.4h, v8.4h, v10.4h + transpose_4x4 v4, v6, v8, v10, v3 + ins v10.d[1], v11.d[0] + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ + v5.4h, v7.4h, v9.4h, v11.4h + transpose_4x4 v5, v7, v9, v11, v3 + ins v10.d[1], v11.d[0] + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ + v26.4h, v27.4h, v28.4h, v29.4h + transpose_4x4 v26, v27, v28, v29, v3 + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + ins v28.d[1], v29.d[0] + add v26.8h, v26.8h, v30.8h + add v28.8h, v28.8h, v30.8h + sqxtun v26.8b, v26.8h + sqxtun v27.8b, v28.8h + + /* Store results to the output buffer */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + ldp TMP3, TMP4, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + st1 {v26.s}[0], [TMP1], 4 + st1 {v27.s}[0], [TMP3], 4 + st1 {v26.s}[1], [TMP2], 4 + st1 {v27.s}[1], [TMP4], 4 +#else + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[0], [TMP3], 1 + st1 {v26.b}[1], [TMP1], 1 + st1 {v27.b}[1], [TMP3], 1 + st1 {v26.b}[2], [TMP1], 1 + st1 {v27.b}[2], [TMP3], 1 + st1 {v26.b}[3], [TMP1], 1 + st1 {v27.b}[3], [TMP3], 1 + + st1 {v26.b}[4], [TMP2], 1 + st1 {v27.b}[4], [TMP4], 1 + st1 {v26.b}[5], [TMP2], 1 + st1 {v27.b}[5], [TMP4], 1 + st1 {v26.b}[6], [TMP2], 1 + st1 {v27.b}[6], [TMP4], 1 + st1 {v26.b}[7], [TMP2], 1 + st1 {v27.b}[7], [TMP4], 1 +#endif + + /* vpop {v8.4h - v15.4h} (not available) */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular Neon optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + sshll v15.4s, \x4, #15 + smull v26.4s, \x6, v14.h[3] + smlal v26.4s, \x10, v14.h[2] + smlal v26.4s, \x12, v14.h[1] + smlal v26.4s, \x16, v14.h[0] + + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v15.4s, v15.4s, #\shift + xtn \y26, v20.4s + xtn \y27, v15.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y27, v15.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* vpush {v8.4h - v15.4h} (not available) */ + sub sp, sp, 64 + mov x9, sp + + /* Load constants */ + get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v14.4h}, [TMP2] + + /* Load all COEF_BLOCK into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | - | - + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | - | - + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* Dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] + + /* Pass 1 */ +#if 0 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h +#else + smull v26.4s, v6.4h, v14.h[3] + smlal v26.4s, v10.4h, v14.h[2] + smlal v26.4s, v12.4h, v14.h[1] + smlal v26.4s, v16.4h, v14.h[0] + smull v24.4s, v7.4h, v14.h[3] + smlal v24.4s, v11.4h, v14.h[2] + smlal v24.4s, v13.4h, v14.h[1] + smlal v24.4s, v17.4h, v14.h[0] + sshll v15.4s, v4.4h, #15 + sshll v30.4s, v5.4h, #15 + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + rshrn v4.4h, v20.4s, #13 + rshrn v6.4h, v15.4s, #13 + add v20.4s, v30.4s, v24.4s + sub v15.4s, v30.4s, v24.4s + rshrn v5.4h, v20.4s, #13 + rshrn v7.4h, v15.4s, #13 + ins v4.d[1], v5.d[0] + ins v6.d[1], v7.d[0] + transpose v4, v6, v3, .16b, .8h + transpose v6, v10, v3, .16b, .4s + ins v11.d[0], v10.d[1] + ins v7.d[0], v6.d[1] +#endif + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + add v26.8h, v26.8h, v30.8h + sqxtun v30.8b, v26.8h + ins v26.d[0], v30.d[0] + sqxtun v27.8b, v26.8h + + /* Store results to the output buffer */ + ldp TMP1, TMP2, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[4], [TMP1], 1 + st1 {v26.b}[1], [TMP2], 1 + st1 {v27.b}[5], [TMP2], 1 + + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro do_load size + .if \size == 8 + ld1 {v4.8b}, [U], 8 + ld1 {v5.8b}, [V], 8 + ld1 {v0.8b}, [Y], 8 + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + .elseif \size == 4 + ld1 {v4.b}[0], [U], 1 + ld1 {v4.b}[1], [U], 1 + ld1 {v4.b}[2], [U], 1 + ld1 {v4.b}[3], [U], 1 + ld1 {v5.b}[0], [V], 1 + ld1 {v5.b}[1], [V], 1 + ld1 {v5.b}[2], [V], 1 + ld1 {v5.b}[3], [V], 1 + ld1 {v0.b}[0], [Y], 1 + ld1 {v0.b}[1], [Y], 1 + ld1 {v0.b}[2], [Y], 1 + ld1 {v0.b}[3], [Y], 1 + .elseif \size == 2 + ld1 {v4.b}[4], [U], 1 + ld1 {v4.b}[5], [U], 1 + ld1 {v5.b}[4], [V], 1 + ld1 {v5.b}[5], [V], 1 + ld1 {v0.b}[4], [Y], 1 + ld1 {v0.b}[5], [Y], 1 + .elseif \size == 1 + ld1 {v4.b}[6], [U], 1 + ld1 {v5.b}[6], [V], 1 + ld1 {v0.b}[6], [Y], 1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size, fast_st3 + .if \bpp == 24 + .if \size == 8 + .if \fast_st3 == 1 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 + .else + st1 {v10.b}[0], [RGB], #1 + st1 {v11.b}[0], [RGB], #1 + st1 {v12.b}[0], [RGB], #1 + + st1 {v10.b}[1], [RGB], #1 + st1 {v11.b}[1], [RGB], #1 + st1 {v12.b}[1], [RGB], #1 + + st1 {v10.b}[2], [RGB], #1 + st1 {v11.b}[2], [RGB], #1 + st1 {v12.b}[2], [RGB], #1 + + st1 {v10.b}[3], [RGB], #1 + st1 {v11.b}[3], [RGB], #1 + st1 {v12.b}[3], [RGB], #1 + + st1 {v10.b}[4], [RGB], #1 + st1 {v11.b}[4], [RGB], #1 + st1 {v12.b}[4], [RGB], #1 + + st1 {v10.b}[5], [RGB], #1 + st1 {v11.b}[5], [RGB], #1 + st1 {v12.b}[5], [RGB], #1 + + st1 {v10.b}[6], [RGB], #1 + st1 {v11.b}[6], [RGB], #1 + st1 {v12.b}[6], [RGB], #1 + + st1 {v10.b}[7], [RGB], #1 + st1 {v11.b}[7], [RGB], #1 + st1 {v12.b}[7], [RGB], #1 + .endif + .elseif \size == 4 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 + .elseif \size == 2 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 + .elseif \size == 1 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 + .elseif \size == 4 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 + .elseif \size == 2 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 + .elseif \size == 1 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 16 + .if \size == 8 + st1 {v25.8h}, [RGB], 16 + .elseif \size == 4 + st1 {v25.4h}, [RGB], 8 + .elseif \size == 2 + st1 {v25.h}[4], [RGB], 2 + st1 {v25.h}[5], [RGB], 2 + .elseif \size == 1 + st1 {v25.h}[6], [RGB], 2 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ + g_offs, gsize, b_offs, bsize, \ + defsize, fast_st3 + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + rshrn v20.4h, v20.4s, #15 + rshrn2 v20.8h, v22.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn2 v24.8h, v26.4s, #14 + rshrn v28.4h, v28.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 + sqxtun v1\g_offs\defsize, v20.8h + sqxtun v1\r_offs\defsize, v24.8h + sqxtun v1\b_offs\defsize, v28.8h + .else + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + sri v25.8h, v21.8h, #5 + sri v25.8h, v29.8h, #11 + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 + rshrn v20.4h, v20.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn v28.4h, v28.4s, #14 + ld1 {v4.8b}, [U], 8 + rshrn2 v20.8h, v22.4s, #15 + rshrn2 v24.8h, v26.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + ld1 {v5.8b}, [V], 8 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + sqxtun v1\g_offs\defsize, v20.8h + ld1 {v0.8b}, [Y], 8 + sqxtun v1\r_offs\defsize, v24.8h + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sqxtun v1\b_offs\defsize, v28.8h + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + .else /**************************** rgb565 ********************************/ + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + ld1 {v0.8b}, [Y], 8 + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + sri v25.8h, v21.8h, #5 + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sri v25.8h, v29.8h, #11 + .endif + do_store \bpp, 8, \fast_st3 + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +.if \fast_st3 == 1 +asm_function jsimd_ycc_\colorid\()_convert_neon +.else +asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + INPUT_ROW .req w2 + OUTPUT_BUF .req x3 + NUM_ROWS .req w4 + + INPUT_BUF0 .req x5 + INPUT_BUF1 .req x6 + INPUT_BUF2 .req x1 + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w15 + + sub sp, sp, 64 + mov x9, sp + + /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ + get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts + + /* Save Neon registers */ + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h}, [x15], 16 + ld1 {v2.8h}, [x15] + + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #8] + ldr INPUT_BUF2, [INPUT_BUF, #16] + .unreq INPUT_BUF + + /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ + movi v10.16b, #255 + movi v13.16b, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] + ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + b.lt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 + subs N, N, #8 + b.ge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8, \fast_st3 + tst N, #7 + b.eq 8f +3: + tst N, #4 + b.eq 3f + do_load 4 +3: + tst N, #2 + b.eq 4f + do_load 2 +4: + tst N, #1 + b.eq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + b.eq 6f + do_store \bpp, 4, \fast_st3 +6: + tst N, #2 + b.eq 7f + do_store \bpp, 2, \fast_st3 +7: + tst N, #1 + b.eq 8f + do_store \bpp, 1, \fast_st3 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 + +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + .elseif \size == 4 + st1 {v20.b}[0], [Y], #1 + st1 {v20.b}[1], [Y], #1 + st1 {v20.b}[2], [Y], #1 + st1 {v20.b}[3], [Y], #1 + st1 {v21.b}[0], [U], #1 + st1 {v21.b}[1], [U], #1 + st1 {v21.b}[2], [U], #1 + st1 {v21.b}[3], [U], #1 + st1 {v22.b}[0], [V], #1 + st1 {v22.b}[1], [V], #1 + st1 {v22.b}[2], [V], #1 + st1 {v22.b}[3], [V], #1 + .elseif \size == 2 + st1 {v20.b}[4], [Y], #1 + st1 {v20.b}[5], [Y], #1 + st1 {v21.b}[4], [U], #1 + st1 {v21.b}[5], [U], #1 + st1 {v22.b}[4], [V], #1 + st1 {v22.b}[5], [V], #1 + .elseif \size == 1 + st1 {v20.b}[6], [Y], #1 + st1 {v21.b}[6], [U], #1 + st1 {v22.b}[6], [V], #1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size, fast_ld3 + .if \bpp == 24 + .if \size == 8 + .if \fast_ld3 == 1 + ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 + .else + ld1 {v10.b}[0], [RGB], #1 + ld1 {v11.b}[0], [RGB], #1 + ld1 {v12.b}[0], [RGB], #1 + + ld1 {v10.b}[1], [RGB], #1 + ld1 {v11.b}[1], [RGB], #1 + ld1 {v12.b}[1], [RGB], #1 + + ld1 {v10.b}[2], [RGB], #1 + ld1 {v11.b}[2], [RGB], #1 + ld1 {v12.b}[2], [RGB], #1 + + ld1 {v10.b}[3], [RGB], #1 + ld1 {v11.b}[3], [RGB], #1 + ld1 {v12.b}[3], [RGB], #1 + + ld1 {v10.b}[4], [RGB], #1 + ld1 {v11.b}[4], [RGB], #1 + ld1 {v12.b}[4], [RGB], #1 + + ld1 {v10.b}[5], [RGB], #1 + ld1 {v11.b}[5], [RGB], #1 + ld1 {v12.b}[5], [RGB], #1 + + ld1 {v10.b}[6], [RGB], #1 + ld1 {v11.b}[6], [RGB], #1 + ld1 {v12.b}[6], [RGB], #1 + + ld1 {v10.b}[7], [RGB], #1 + ld1 {v11.b}[7], [RGB], #1 + ld1 {v12.b}[7], [RGB], #1 + .endif + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 + .elseif \size == 2 + ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 + .elseif \size == 1 + ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 + .elseif \size == 2 + ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 + .elseif \size == 1 + ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ + b_offs, fast_ld3 + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ + ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ + ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ + rev64 v18.4s, v1.4s + rev64 v26.4s, v1.4s + rev64 v28.4s, v1.4s + rev64 v30.4s, v1.4s + umull v14.4s, v4.4h, v0.h[0] + umull2 v16.4s, v4.8h, v0.h[0] + umlsl v18.4s, v4.4h, v0.h[3] + umlsl2 v26.4s, v4.8h, v0.h[3] + umlal v28.4s, v4.4h, v0.h[5] + umlal2 v30.4s, v4.8h, v0.h[5] + umlal v14.4s, v6.4h, v0.h[1] + umlal2 v16.4s, v6.8h, v0.h[1] + umlsl v18.4s, v6.4h, v0.h[4] + umlsl2 v26.4s, v6.8h, v0.h[4] + umlsl v28.4s, v6.4h, v0.h[6] + umlsl2 v30.4s, v6.8h, v0.h[6] + umlal v14.4s, v8.4h, v0.h[2] + umlal2 v16.4s, v8.8h, v0.h[2] + umlal v18.4s, v8.4h, v0.h[5] + umlal2 v26.4s, v8.8h, v0.h[5] + umlsl v28.4s, v8.4h, v0.h[7] + umlsl2 v30.4s, v8.8h, v0.h[7] +.endm + +.macro do_rgb_to_yuv_stage2 + rshrn v20.4h, v14.4s, #16 + shrn v22.4h, v18.4s, #16 + shrn v24.4h, v28.4s, #16 + rshrn2 v20.8h, v16.4s, #16 + shrn2 v22.8h, v26.4s, #16 + shrn2 v24.8h, v30.4s, #16 + xtn v20.8b, v20.8h /* v20 = y */ + xtn v21.8b, v22.8h /* v21 = u */ + xtn v22.8b, v24.8h /* v22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +/* TODO: expand macros and interleave instructions if some in-order + * AArch64 processor actually can dual-issue LOAD/STORE with ALU */ +.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 + do_rgb_to_yuv_stage2 + do_load \bpp, 8, \fast_ld3 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + do_rgb_to_yuv_stage1 +.endm + +.if \fast_ld3 == 1 +asm_function jsimd_\colorid\()_ycc_convert_neon +.else +asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + OUTPUT_BUF .req x2 + OUTPUT_ROW .req w3 + NUM_ROWS .req w4 + + OUTPUT_BUF0 .req x5 + OUTPUT_BUF1 .req x6 + OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w12 + + /* Load constants to d0, d1, d2, d3 */ + get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts + ld1 {v0.8h, v1.8h}, [x13] + + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] + .unreq OUTPUT_BUF + + /* Save Neon registers */ + sub sp, sp, #64 + mov x9, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load \bpp, 8, \fast_ld3 + do_rgb_to_yuv_stage1 + subs N, N, #8 + b.lt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 + subs N, N, #8 + b.ge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + b.eq 8f +3: + tbz N, #2, 3f + do_load \bpp, 4, \fast_ld3 +3: + tbz N, #1, 4f + do_load \bpp, 2, \fast_ld3 +4: + tbz N, #0, 5f + do_load \bpp, 1, \fast_ld3 +5: + do_rgb_to_yuv + tbz N, #2, 6f + do_store 4 +6: + tbz N, #1, 7f + do_store 2 +7: + tbz N, #0, 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B Fast LD3 */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 + +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req x0 + START_COL .req x1 + WORKSPACE .req x2 + TMP1 .req x9 + TMP2 .req x10 + TMP3 .req x11 + TMP4 .req x12 + TMP5 .req x13 + TMP6 .req x14 + TMP7 .req x15 + TMP8 .req x4 + TMPDUP .req w3 + + /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x1 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x1, w1 + + mov TMPDUP, #128 + ldp TMP1, TMP2, [SAMPLE_DATA], 16 + ldp TMP3, TMP4, [SAMPLE_DATA], 16 + dup v0.8b, TMPDUP + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + ldp TMP5, TMP6, [SAMPLE_DATA], 16 + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + ldp TMP7, TMP8, [SAMPLE_DATA], 16 + add TMP5, TMP5, START_COL + add TMP6, TMP6, START_COL + ld1 {v16.8b}, [TMP1] + add TMP7, TMP7, START_COL + add TMP8, TMP8, START_COL + ld1 {v17.8b}, [TMP2] + usubl v16.8h, v16.8b, v0.8b + ld1 {v18.8b}, [TMP3] + usubl v17.8h, v17.8b, v0.8b + ld1 {v19.8b}, [TMP4] + usubl v18.8h, v18.8b, v0.8b + ld1 {v20.8b}, [TMP5] + usubl v19.8h, v19.8b, v0.8b + ld1 {v21.8b}, [TMP6] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 + usubl v20.8h, v20.8b, v0.8b + ld1 {v22.8b}, [TMP7] + usubl v21.8h, v21.8b, v0.8b + ld1 {v23.8b}, [TMP8] + usubl v22.8h, v22.8b, v0.8b + usubl v23.8h, v23.8b, v0.8b + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 + + br x30 + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + .unreq TMPDUP + +/*****************************************************************************/ + +/* + * jsimd_fdct_islow_neon + * + * This file contains a slower but more accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). The following code is based + * directly on the IJG''s original jfdctint.c; see the jfdctint.c for + * more details. + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_fdct_islow_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts + ld1 {v0.8h, v1.8h}, [TMP] + + /* Save Neon registers */ + sub sp, sp, #64 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 + + /* Load all DATA into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v16.8h + * 1 | d18 | d19 | v17.8h + * 2 | d20 | d21 | v18.8h + * 3 | d22 | d23 | v19.8h + * 4 | d24 | d25 | v20.8h + * 5 | d26 | d27 | v21.8h + * 6 | d28 | d29 | v22.8h + * 7 | d30 | d31 | v23.8h + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + sub DATA, DATA, #64 + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ + shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P1 + rshrn v22.4h, v22.4s, #DESCALE_P1 + rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s /* z3 += z5 */ + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s /* z4 += z5 */ + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P1 + rshrn v21.4h, v29.4s, #DESCALE_P1 + rshrn v19.4h, v30.4s, #DESCALE_P1 + rshrn v17.4h, v31.4s, #DESCALE_P1 + rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */ + srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P2 + rshrn v22.4h, v22.4s, #DESCALE_P2 + rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P2 + rshrn v21.4h, v29.4s, #DESCALE_P2 + rshrn v19.4h, v30.4s, #DESCALE_P2 + rshrn v17.4h, v31.4s, #DESCALE_P2 + rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + /* Restore Neon registers */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + + br x30 + + .unreq DATA + .unreq TMP + +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#undef XFIX_0_541196100 +#define XFIX_0_382683433 v0.h[0] +#define XFIX_0_541196100 v0.h[1] +#define XFIX_0_707106781 v0.h[2] +#define XFIX_1_306562965 v0.h[3] + +asm_function jsimd_fdct_ifast_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts + ld1 {v0.4h}, [TMP] + + /* Load all DATA into Neon registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v0.8h + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + mov TMP, #2 + sub DATA, DATA, #64 +1: + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 + subs TMP, TMP, #1 + /* 1-D FDCT */ + add v4.8h, v19.8h, v20.8h + sub v20.8h, v19.8h, v20.8h + sub v28.8h, v18.8h, v21.8h + add v18.8h, v18.8h, v21.8h + sub v29.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v22.8h + sub v21.8h, v16.8h, v23.8h + add v16.8h, v16.8h, v23.8h + sub v6.8h, v17.8h, v18.8h + sub v7.8h, v16.8h, v4.8h + add v5.8h, v17.8h, v18.8h + add v6.8h, v6.8h, v7.8h + add v4.8h, v16.8h, v4.8h + sqdmulh v6.8h, v6.8h, XFIX_0_707106781 + add v19.8h, v20.8h, v28.8h + add v16.8h, v4.8h, v5.8h + sub v20.8h, v4.8h, v5.8h + add v5.8h, v28.8h, v29.8h + add v29.8h, v29.8h, v21.8h + sqdmulh v5.8h, v5.8h, XFIX_0_707106781 + sub v28.8h, v19.8h, v29.8h + add v18.8h, v7.8h, v6.8h + sqdmulh v28.8h, v28.8h, XFIX_0_382683433 + sub v22.8h, v7.8h, v6.8h + sqdmulh v19.8h, v19.8h, XFIX_0_541196100 + sqdmulh v7.8h, v29.8h, XFIX_1_306562965 + add v6.8h, v21.8h, v5.8h + sub v5.8h, v21.8h, v5.8h + add v29.8h, v29.8h, v28.8h + add v19.8h, v19.8h, v28.8h + add v29.8h, v29.8h, v7.8h + add v21.8h, v5.8h, v19.8h + sub v19.8h, v5.8h, v19.8h + add v17.8h, v6.8h, v29.8h + sub v23.8h, v6.8h, v29.8h + + b.ne 1b + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + br x30 + + .unreq DATA + .unreq TMP +#undef XFIX_0_382683433 +#undef XFIX_0_541196100 +#undef XFIX_0_707106781 +#undef XFIX_1_306562965 + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req x0 + DIVISORS .req x1 + WORKSPACE .req x2 + + RECIPROCAL .req DIVISORS + CORRECTION .req x9 + SHIFT .req x10 + LOOP_COUNT .req x11 + + mov LOOP_COUNT, #2 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) +1: + subs LOOP_COUNT, LOOP_COUNT, #1 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 + abs v20.8h, v0.8h + abs v21.8h, v1.8h + abs v22.8h, v2.8h + abs v23.8h, v3.8h + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 + add v20.8h, v20.8h, v4.8h /* add correction */ + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v6.8h + add v23.8h, v23.8h, v7.8h + umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ + umull2 v16.4s, v20.8h, v28.8h + umull v5.4s, v21.4h, v29.4h + umull2 v17.4s, v21.8h, v29.8h + umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ + umull2 v18.4s, v22.8h, v30.8h + umull v7.4s, v23.4h, v31.4h + umull2 v19.4s, v23.8h, v31.8h + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 + shrn v4.4h, v4.4s, #16 + shrn v5.4h, v5.4s, #16 + shrn v6.4h, v6.4s, #16 + shrn v7.4h, v7.4s, #16 + shrn2 v4.8h, v16.4s, #16 + shrn2 v5.8h, v17.4s, #16 + shrn2 v6.8h, v18.4s, #16 + shrn2 v7.8h, v19.4s, #16 + neg v24.8h, v24.8h + neg v25.8h, v25.8h + neg v26.8h, v26.8h + neg v27.8h, v27.8h + sshr v0.8h, v0.8h, #15 /* extract sign */ + sshr v1.8h, v1.8h, #15 + sshr v2.8h, v2.8h, #15 + sshr v3.8h, v3.8h, #15 + ushl v4.8h, v4.8h, v24.8h /* shift */ + ushl v5.8h, v5.8h, v25.8h + ushl v6.8h, v6.8h, v26.8h + ushl v7.8h, v7.8h, v27.8h + + eor v4.16b, v4.16b, v0.16b /* restore sign */ + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + sub v4.8h, v4.8h, v0.8h + sub v5.8h, v5.8h, v1.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 + + b.ne 1b + + br x30 /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. + * + * GLOBAL(void) + * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, + * JDIMENSION width_in_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +asm_function jsimd_h2v1_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR .req x10 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #0x10000 + lsl TMP2, BLOCK_WIDTH, #4 + sub TMP2, TMP2, IMAGE_WIDTH + get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR] + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + uadalp v4.8h, v2.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + * + * GLOBAL(void) + * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, + * JDIMENSION width_in_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +.balign 16 +asm_function jsimd_h2v2_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR0 .req x10 + INPTR1 .req x14 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #1 + lsl TMP2, BLOCK_WIDTH, #4 + lsl TMPDUP, TMPDUP, #17 + sub TMP2, TMP2, IMAGE_WIDTH + get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts + orr TMPDUP, TMPDUP, #1 + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR0, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + ldr INPTR1, [INPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + uadalp v4.8h, v1.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + tbl v3.16b, {v1.16b}, v18.16b + uadalp v4.8h, v2.16b + uadalp v4.8h, v3.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR0 + .unreq INPTR1 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET *) + * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + + BUFFER .req x1 + PUT_BUFFER .req x6 + PUT_BITS .req x7 + PUT_BITSw .req w7 + +.macro emit_byte + sub PUT_BITS, PUT_BITS, #0x8 + lsr x19, PUT_BUFFER, PUT_BITS + uxtb w19, w19 + strb w19, [BUFFER, #1]! + cmp w19, #0xff + b.ne 14f + strb wzr, [BUFFER, #1]! +14: +.endm +.macro put_bits CODE, SIZE + lsl PUT_BUFFER, PUT_BUFFER, \SIZE + add PUT_BITS, PUT_BITS, \SIZE + orr PUT_BUFFER, PUT_BUFFER, \CODE +.endm +.macro checkbuf31 + cmp PUT_BITS, #0x20 + b.lt 31f + emit_byte + emit_byte + emit_byte + emit_byte +31: +.endm +.macro checkbuf47 + cmp PUT_BITS, #0x30 + b.lt 47f + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte +47: +.endm + +.macro generate_jsimd_huff_encode_one_block fast_tbl + +.if \fast_tbl == 1 +asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif + sub sp, sp, 272 + sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ + /* Save Arm registers */ + stp x19, x20, [sp] + get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts + ldr PUT_BUFFER, [x0, #0x10] + ldr PUT_BITSw, [x0, #0x18] + ldrsh w12, [x2] /* load DC coeff in w12 */ + /* prepare data */ +.if \fast_tbl == 1 + ld1 {v23.16b}, [x15], #16 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + /* ZigZag 8x8 */ + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b + tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b + tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b + tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b + ins v0.h[0], w12 + tbx v1.16b, {v28.16b}, v16.16b + tbx v2.16b, {v29.16b, v30.16b}, v17.16b + tbx v5.16b, {v29.16b, v30.16b}, v18.16b + tbx v6.16b, {v31.16b}, v19.16b +.else + add x13, x2, #0x22 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + ld1 {v23.16b}, [x15] + add x14, x2, #0x18 + add x3, x2, #0x36 + ins v0.h[0], w12 + add x9, x2, #0x2 + ld1 {v1.h}[0], [x13] + add x15, x2, #0x30 + ld1 {v2.h}[0], [x14] + add x19, x2, #0x26 + ld1 {v3.h}[0], [x3] + add x20, x2, #0x28 + ld1 {v0.h}[1], [x9] + add x12, x2, #0x10 + ld1 {v1.h}[1], [x15] + add x13, x2, #0x40 + ld1 {v2.h}[1], [x19] + add x14, x2, #0x34 + ld1 {v3.h}[1], [x20] + add x3, x2, #0x1a + ld1 {v0.h}[2], [x12] + add x9, x2, #0x20 + ld1 {v1.h}[2], [x13] + add x15, x2, #0x32 + ld1 {v2.h}[2], [x14] + add x19, x2, #0x42 + ld1 {v3.h}[2], [x3] + add x20, x2, #0xc + ld1 {v0.h}[3], [x9] + add x12, x2, #0x12 + ld1 {v1.h}[3], [x15] + add x13, x2, #0x24 + ld1 {v2.h}[3], [x19] + add x14, x2, #0x50 + ld1 {v3.h}[3], [x20] + add x3, x2, #0xe + ld1 {v0.h}[4], [x12] + add x9, x2, #0x4 + ld1 {v1.h}[4], [x13] + add x15, x2, #0x16 + ld1 {v2.h}[4], [x14] + add x19, x2, #0x60 + ld1 {v3.h}[4], [x3] + add x20, x2, #0x1c + ld1 {v0.h}[5], [x9] + add x12, x2, #0x6 + ld1 {v1.h}[5], [x15] + add x13, x2, #0x8 + ld1 {v2.h}[5], [x19] + add x14, x2, #0x52 + ld1 {v3.h}[5], [x20] + add x3, x2, #0x2a + ld1 {v0.h}[6], [x12] + add x9, x2, #0x14 + ld1 {v1.h}[6], [x13] + add x15, x2, #0xa + ld1 {v2.h}[6], [x14] + add x19, x2, #0x44 + ld1 {v3.h}[6], [x3] + add x20, x2, #0x38 + ld1 {v0.h}[7], [x9] + add x12, x2, #0x46 + ld1 {v1.h}[7], [x15] + add x13, x2, #0x3a + ld1 {v2.h}[7], [x19] + add x14, x2, #0x74 + ld1 {v3.h}[7], [x20] + add x3, x2, #0x6a + ld1 {v4.h}[0], [x12] + add x9, x2, #0x54 + ld1 {v5.h}[0], [x13] + add x15, x2, #0x2c + ld1 {v6.h}[0], [x14] + add x19, x2, #0x76 + ld1 {v7.h}[0], [x3] + add x20, x2, #0x78 + ld1 {v4.h}[1], [x9] + add x12, x2, #0x62 + ld1 {v5.h}[1], [x15] + add x13, x2, #0x1e + ld1 {v6.h}[1], [x19] + add x14, x2, #0x68 + ld1 {v7.h}[1], [x20] + add x3, x2, #0x7a + ld1 {v4.h}[2], [x12] + add x9, x2, #0x70 + ld1 {v5.h}[2], [x13] + add x15, x2, #0x2e + ld1 {v6.h}[2], [x14] + add x19, x2, #0x5a + ld1 {v7.h}[2], [x3] + add x20, x2, #0x6c + ld1 {v4.h}[3], [x9] + add x12, x2, #0x72 + ld1 {v5.h}[3], [x15] + add x13, x2, #0x3c + ld1 {v6.h}[3], [x19] + add x14, x2, #0x4c + ld1 {v7.h}[3], [x20] + add x3, x2, #0x5e + ld1 {v4.h}[4], [x12] + add x9, x2, #0x64 + ld1 {v5.h}[4], [x13] + add x15, x2, #0x4a + ld1 {v6.h}[4], [x14] + add x19, x2, #0x3e + ld1 {v7.h}[4], [x3] + add x20, x2, #0x6e + ld1 {v4.h}[5], [x9] + add x12, x2, #0x56 + ld1 {v5.h}[5], [x15] + add x13, x2, #0x58 + ld1 {v6.h}[5], [x19] + add x14, x2, #0x4e + ld1 {v7.h}[5], [x20] + add x3, x2, #0x7c + ld1 {v4.h}[6], [x12] + add x9, x2, #0x48 + ld1 {v5.h}[6], [x13] + add x15, x2, #0x66 + ld1 {v6.h}[6], [x14] + add x19, x2, #0x5c + ld1 {v7.h}[6], [x3] + add x20, x2, #0x7e + ld1 {v4.h}[7], [x9] + ld1 {v5.h}[7], [x15] + ld1 {v6.h}[7], [x19] + ld1 {v7.h}[7], [x20] +.endif + cmlt v24.8h, v0.8h, #0 + cmlt v25.8h, v1.8h, #0 + cmlt v26.8h, v2.8h, #0 + cmlt v27.8h, v3.8h, #0 + cmlt v28.8h, v4.8h, #0 + cmlt v29.8h, v5.8h, #0 + cmlt v30.8h, v6.8h, #0 + cmlt v31.8h, v7.8h, #0 + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + cmeq v16.8h, v0.8h, #0 + cmeq v17.8h, v1.8h, #0 + cmeq v18.8h, v2.8h, #0 + cmeq v19.8h, v3.8h, #0 + cmeq v20.8h, v4.8h, #0 + cmeq v21.8h, v5.8h, #0 + cmeq v22.8h, v6.8h, #0 + xtn v16.8b, v16.8h + xtn v18.8b, v18.8h + xtn v20.8b, v20.8h + xtn v22.8b, v22.8h + umov w14, v0.h[0] + xtn2 v16.16b, v17.8h + umov w13, v24.h[0] + xtn2 v18.16b, v19.8h + clz w14, w14 + xtn2 v20.16b, v21.8h + lsl w13, w13, w14 + cmeq v17.8h, v7.8h, #0 + sub w12, w14, #32 + xtn2 v22.16b, v17.8h + lsr w13, w13, w14 + and v16.16b, v16.16b, v23.16b + neg w12, w12 + and v18.16b, v18.16b, v23.16b + add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ + and v20.16b, v20.16b, v23.16b + add x15, sp, #0x90 /* x15 = t2 */ + and v22.16b, v22.16b, v23.16b + ldr w10, [x4, x12, lsl #2] + addp v16.16b, v16.16b, v18.16b + ldrb w11, [x3, x12] + addp v20.16b, v20.16b, v22.16b + checkbuf47 + addp v16.16b, v16.16b, v20.16b + put_bits x10, x11 + addp v16.16b, v16.16b, v18.16b + checkbuf47 + umov x9, v16.D[0] + put_bits x13, x12 + cnt v17.8b, v16.8b + mvn x9, x9 + addv B18, v17.8b + add x4, x5, #0x400 /* x4 = actbl->ehufsi */ + umov w12, v18.b[0] + lsr x9, x9, #0x1 /* clear AC coeff */ + ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ + rbit x9, x9 /* x9 = index0 */ + ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ + cmp w12, #(64-8) + add x11, sp, #16 + b.lt 4f + cbz x9, 6f + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w20, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + clz w20, w20 + ldrh w3, [x15, #2]! + sub w11, w20, #32 + lsl w3, w3, w20 + neg w11, w11 + lsr w3, w3, w20 + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b + b 6f +4: + movi v21.8h, #0x0010 + clz v0.8h, v0.8h + clz v1.8h, v1.8h + clz v2.8h, v2.8h + clz v3.8h, v3.8h + clz v4.8h, v4.8h + clz v5.8h, v5.8h + clz v6.8h, v6.8h + clz v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + neg v0.8h, v0.8h + neg v1.8h, v1.8h + neg v2.8h, v2.8h + neg v3.8h, v3.8h + neg v4.8h, v4.8h + neg v5.8h, v5.8h + neg v6.8h, v6.8h + neg v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + add v0.8h, v21.8h, v0.8h + add v1.8h, v21.8h, v1.8h + add v2.8h, v21.8h, v2.8h + add v3.8h, v21.8h, v3.8h + add v4.8h, v21.8h, v4.8h + add v5.8h, v21.8h, v5.8h + add v6.8h, v21.8h, v6.8h + add v7.8h, v21.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w11, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + ldrh w3, [x15, #2]! + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b +6: + add x13, sp, #0x10e + cmp x15, x13 + b.hs 1f + ldr w12, [x5] + ldrb w14, [x4] + checkbuf47 + put_bits x12, x14 +1: + str PUT_BUFFER, [x0, #0x10] + str PUT_BITSw, [x0, #0x18] + ldp x19, x20, [sp], 16 + add x0, BUFFER, #0x1 + add sp, sp, 256 + br x30 + +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + + .unreq BUFFER + .unreq PUT_BUFFER + .unreq PUT_BITS + .unreq PUT_BITSw + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf31 +.purgem checkbuf47 diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm new file mode 100644 index 0000000000..c46d684436 --- /dev/null +++ b/media/libjpeg/simd/i386/jccolext-avx2.asm @@ -0,0 +1,578 @@ +; +; jccolext.asm - colorspace conversion (AVX2) +; +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2) + +EXTN(jsimd_rgb_ycc_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + vmovd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub ecx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [esi+ecx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub ecx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub ecx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE + vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO + vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE + vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vmovdqa ymm7, ymm1 + vmovdqa ymm4, ymm6 + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + vpxor ymm1, ymm1, ymm1 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL + vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH + vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500) + + vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm7, ymm7, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH + vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO + + vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vmovdqa ymm5, ymm0 + vmovdqa ymm4, ymm6 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + vpxor ymm0, ymm0, ymm0 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL + vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH + vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500) + + vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm5, ymm5, ymm0 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm4, ymm4, ymm1 + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH + vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm5, ymm5, ymm7 ; ymm5=Cb + vmovdqu YMMWORD [ebx], ymm5 ; Save Cb + + vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO + vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE + vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vmovdqa ymm7, ymm0 + vmovdqa ymm5, ymm4 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, YMMWORD [wk(4)] + vpaddd ymm4, ymm4, YMMWORD [wk(5)] + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vpxor ymm3, ymm3, ymm3 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL + vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH + vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500) + + vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm3 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm5, ymm5, ymm1 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH + vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO + + vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vmovdqa ymm1, ymm6 + vmovdqa ymm5, ymm4 + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(6)] + vpaddd ymm4, ymm4, YMMWORD [wk(7)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [edi], ymm6 ; Save Y + + vpxor ymm2, ymm2, ymm2 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL + vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH + vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500) + + vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ] + + vpaddd ymm1, ymm1, ymm2 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm1, ymm1, ymm0 + vpaddd ymm5, ymm5, ymm0 + vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH + vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Cr + vmovdqu YMMWORD [edx], ymm1 ; Save Cr + + sub ecx, byte SIZEOF_YMMWORD + add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add edi, byte SIZEOF_YMMWORD ; outptr0 + add ebx, byte SIZEOF_YMMWORD ; outptr1 + add edx, byte SIZEOF_YMMWORD ; outptr2 + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm new file mode 100644 index 0000000000..6357a42b2c --- /dev/null +++ b/media/libjpeg/simd/i386/jccolext-mmx.asm @@ -0,0 +1,476 @@ +; +; jccolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx) + +EXTN(jsimd_rgb_ycc_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax, eax + mov al, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx, edx + mov dx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd mmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, dword [esi+ecx] + psllq mmA, DWORD_BIT + por mmA, mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_ycc_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmF, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH, mmH + + movq mmC, mmA + punpcklbw mmA, mmH ; mmA=(00 02 04 06) + punpckhbw mmC, mmH ; mmC=(10 12 14 16) + + movq mmB, mmE + punpcklbw mmE, mmH ; mmE=(20 22 24 26) + punpckhbw mmB, mmH ; mmB=(01 03 05 07) + + movq mmF, mmD + punpcklbw mmD, mmH ; mmD=(11 13 15 17) + punpckhbw mmF, mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF, mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmD, mmA + movq mmC, mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB, mmA + punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG, mmD + punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE, mmA + punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH, mmB + punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF, mmF + + movq mmC, mmA + punpcklbw mmA, mmF ; mmA=(00 02 04 06) + punpckhbw mmC, mmF ; mmC=(10 12 14 16) + + movq mmD, mmB + punpcklbw mmB, mmF ; mmB=(01 03 05 07) + punpckhbw mmD, mmF ; mmD=(11 13 15 17) + + movq mmG, mmE + punpcklbw mmE, mmF ; mmE=(20 22 24 26) + punpckhbw mmG, mmF ; mmG=(30 32 34 36) + + punpcklbw mmF, mmH + punpckhbw mmH, mmH + psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movq MMWORD [wk(0)], mm0 ; wk(0)=RE + movq MMWORD [wk(1)], mm1 ; wk(1)=RO + movq MMWORD [wk(2)], mm4 ; wk(2)=BE + movq MMWORD [wk(3)], mm5 ; wk(3)=BO + + movq mm6, mm1 + punpcklwd mm1, mm3 + punpckhwd mm6, mm3 + movq mm7, mm1 + movq mm4, mm6 + pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor mm1, mm1 + pxor mm6, mm6 + punpcklwd mm1, mm5 ; mm1=BOL + punpckhwd mm6, mm5 ; mm6=BOH + psrld mm1, 1 ; mm1=BOL*FIX(0.500) + psrld mm6, 1 ; mm6=BOH*FIX(0.500) + + movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] + + paddd mm7, mm1 + paddd mm4, mm6 + paddd mm7, mm5 + paddd mm4, mm5 + psrld mm7, SCALEBITS ; mm7=CbOL + psrld mm4, SCALEBITS ; mm4=CbOH + packssdw mm7, mm4 ; mm7=CbO + + movq mm1, MMWORD [wk(2)] ; mm1=BE + + movq mm6, mm0 + punpcklwd mm0, mm2 + punpckhwd mm6, mm2 + movq mm5, mm0 + movq mm4, mm6 + pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor mm0, mm0 + pxor mm6, mm6 + punpcklwd mm0, mm1 ; mm0=BEL + punpckhwd mm6, mm1 ; mm6=BEH + psrld mm0, 1 ; mm0=BEL*FIX(0.500) + psrld mm6, 1 ; mm6=BEH*FIX(0.500) + + movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm5, mm0 + paddd mm4, mm6 + paddd mm5, mm1 + paddd mm4, mm1 + psrld mm5, SCALEBITS ; mm5=CbEL + psrld mm4, SCALEBITS ; mm4=CbEH + packssdw mm5, mm4 ; mm5=CbE + + psllw mm7, BYTE_BIT + por mm5, mm7 ; mm5=Cb + movq MMWORD [ebx], mm5 ; Save Cb + + movq mm0, MMWORD [wk(3)] ; mm0=BO + movq mm6, MMWORD [wk(2)] ; mm6=BE + movq mm1, MMWORD [wk(1)] ; mm1=RO + + movq mm4, mm0 + punpcklwd mm0, mm3 + punpckhwd mm4, mm3 + movq mm7, mm0 + movq mm5, mm4 + pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, MMWORD [wk(4)] + paddd mm4, MMWORD [wk(5)] + paddd mm0, mm3 + paddd mm4, mm3 + psrld mm0, SCALEBITS ; mm0=YOL + psrld mm4, SCALEBITS ; mm4=YOH + packssdw mm0, mm4 ; mm0=YO + + pxor mm3, mm3 + pxor mm4, mm4 + punpcklwd mm3, mm1 ; mm3=ROL + punpckhwd mm4, mm1 ; mm4=ROH + psrld mm3, 1 ; mm3=ROL*FIX(0.500) + psrld mm4, 1 ; mm4=ROH*FIX(0.500) + + movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm7, mm3 + paddd mm5, mm4 + paddd mm7, mm1 + paddd mm5, mm1 + psrld mm7, SCALEBITS ; mm7=CrOL + psrld mm5, SCALEBITS ; mm5=CrOH + packssdw mm7, mm5 ; mm7=CrO + + movq mm3, MMWORD [wk(0)] ; mm3=RE + + movq mm4, mm6 + punpcklwd mm6, mm2 + punpckhwd mm4, mm2 + movq mm1, mm6 + movq mm5, mm4 + pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(6)] + paddd mm4, MMWORD [wk(7)] + paddd mm6, mm2 + paddd mm4, mm2 + psrld mm6, SCALEBITS ; mm6=YEL + psrld mm4, SCALEBITS ; mm4=YEH + packssdw mm6, mm4 ; mm6=YE + + psllw mm0, BYTE_BIT + por mm6, mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + pxor mm2, mm2 + pxor mm4, mm4 + punpcklwd mm2, mm3 ; mm2=REL + punpckhwd mm4, mm3 ; mm4=REH + psrld mm2, 1 ; mm2=REL*FIX(0.500) + psrld mm4, 1 ; mm4=REH*FIX(0.500) + + movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] + + paddd mm1, mm2 + paddd mm5, mm4 + paddd mm1, mm0 + paddd mm5, mm0 + psrld mm1, SCALEBITS ; mm1=CrEL + psrld mm5, SCALEBITS ; mm5=CrEH + packssdw mm1, mm5 ; mm1=CrE + + psllw mm7, BYTE_BIT + por mm1, mm7 ; mm1=Cr + movq MMWORD [edx], mm1 ; Save Cr + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + add ebx, byte SIZEOF_MMWORD ; outptr1 + add edx, byte SIZEOF_MMWORD ; outptr2 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm new file mode 100644 index 0000000000..c6c80852ac --- /dev/null +++ b/media/libjpeg/simd/i386/jccolext-sse2.asm @@ -0,0 +1,503 @@ +; +; jccolext.asm - colorspace conversion (SSE2) +; +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + movdqa xmm7, xmm1 + movdqa xmm4, xmm6 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1, xmm1 + pxor xmm6, xmm6 + punpcklwd xmm1, xmm5 ; xmm1=BOL + punpckhwd xmm6, xmm5 ; xmm6=BOH + psrld xmm1, 1 ; xmm1=BOL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm1 + paddd xmm4, xmm6 + paddd xmm7, xmm5 + paddd xmm4, xmm5 + psrld xmm7, SCALEBITS ; xmm7=CbOL + psrld xmm4, SCALEBITS ; xmm4=CbOH + packssdw xmm7, xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + movdqa xmm5, xmm0 + movdqa xmm4, xmm6 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0, xmm0 + pxor xmm6, xmm6 + punpcklwd xmm0, xmm1 ; xmm0=BEL + punpckhwd xmm6, xmm1 ; xmm6=BEH + psrld xmm0, 1 ; xmm0=BEL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5, xmm0 + paddd xmm4, xmm6 + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrld xmm5, SCALEBITS ; xmm5=CbEL + psrld xmm4, SCALEBITS ; xmm4=CbEH + packssdw xmm5, xmm4 ; xmm5=CbE + + psllw xmm7, BYTE_BIT + por xmm5, xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + movdqa xmm7, xmm0 + movdqa xmm5, xmm4 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + pxor xmm3, xmm3 + pxor xmm4, xmm4 + punpcklwd xmm3, xmm1 ; xmm3=ROL + punpckhwd xmm4, xmm1 ; xmm4=ROH + psrld xmm3, 1 ; xmm3=ROL*FIX(0.500) + psrld xmm4, 1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm3 + paddd xmm5, xmm4 + paddd xmm7, xmm1 + paddd xmm5, xmm1 + psrld xmm7, SCALEBITS ; xmm7=CrOL + psrld xmm5, SCALEBITS ; xmm5=CrOH + packssdw xmm7, xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2, xmm2 + pxor xmm4, xmm4 + punpcklwd xmm2, xmm3 ; xmm2=REL + punpckhwd xmm4, xmm3 ; xmm4=REH + psrld xmm2, 1 ; xmm2=REL*FIX(0.500) + psrld xmm4, 1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1, xmm2 + paddd xmm5, xmm4 + paddd xmm1, xmm0 + paddd xmm5, xmm0 + psrld xmm1, SCALEBITS ; xmm1=CrEL + psrld xmm5, SCALEBITS ; xmm5=CrEH + packssdw xmm1, xmm5 ; xmm1=CrE + + psllw xmm7, BYTE_BIT + por xmm1, xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm new file mode 100644 index 0000000000..14944e952f --- /dev/null +++ b/media/libjpeg/simd/i386/jccolor-avx2.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (AVX2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) + +EXTN(jconst_rgb_ycc_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm new file mode 100644 index 0000000000..8cb399bdc4 --- /dev/null +++ b/media/libjpeg/simd/i386/jccolor-mmx.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_mmx) + +EXTN(jconst_rgb_ycc_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx +%include "jccolext-mmx.asm" diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm new file mode 100644 index 0000000000..686d222ff7 --- /dev/null +++ b/media/libjpeg/simd/i386/jccolor-sse2.asm @@ -0,0 +1,120 @@ +; +; jccolor.asm - colorspace conversion (SSE2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm new file mode 100644 index 0000000000..560ee0c71e --- /dev/null +++ b/media/libjpeg/simd/i386/jcgray-avx2.asm @@ -0,0 +1,113 @@ +; +; jcgray.asm - grayscale colorspace conversion (AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_avx2) + +EXTN(jconst_rgb_gray_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm new file mode 100644 index 0000000000..79fdf082a8 --- /dev/null +++ b/media/libjpeg/simd/i386/jcgray-mmx.asm @@ -0,0 +1,113 @@ +; +; jcgray.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_mmx) + +EXTN(jconst_rgb_gray_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx +%include "jcgryext-mmx.asm" diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm new file mode 100644 index 0000000000..cb4b28e8f4 --- /dev/null +++ b/media/libjpeg/simd/i386/jcgray-sse2.asm @@ -0,0 +1,112 @@ +; +; jcgray.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm new file mode 100644 index 0000000000..3fa7973d72 --- /dev/null +++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm @@ -0,0 +1,457 @@ +; +; jcgryext.asm - grayscale colorspace conversion (AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2) + +EXTN(jsimd_rgb_gray_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + vmovd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [esi+ecx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub ecx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [esi+ecx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub ecx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub ecx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov ecx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa ymm0, ymm5 ; ymm0=BO + vmovdqa ymm6, ymm4 ; ymm6=BE + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm4, ymm4, ymm7 + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(0)] + vpaddd ymm4, ymm4, YMMWORD [wk(1)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [edi], ymm6 ; Save Y + + sub ecx, byte SIZEOF_YMMWORD + add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add edi, byte SIZEOF_YMMWORD ; outptr0 + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm new file mode 100644 index 0000000000..8af42e5a33 --- /dev/null +++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm @@ -0,0 +1,355 @@ +; +; jcgryext.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax, eax + mov al, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx, edx + mov dx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd mmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, dword [esi+ecx] + psllq mmA, DWORD_BIT + por mmA, mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF, mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE, mmA + psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH, mmH + + movq mmC, mmA + punpcklbw mmA, mmH ; mmA=(00 02 04 06) + punpckhbw mmC, mmH ; mmC=(10 12 14 16) + + movq mmB, mmE + punpcklbw mmE, mmH ; mmE=(20 22 24 26) + punpckhbw mmB, mmH ; mmB=(01 03 05 07) + + movq mmF, mmD + punpcklbw mmD, mmH ; mmD=(11 13 15 17) + punpckhbw mmF, mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF, mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD, mmA + movq mmC, mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB, mmA + punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG, mmD + punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE, mmA + punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH, mmB + punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF, mmF + + movq mmC, mmA + punpcklbw mmA, mmF ; mmA=(00 02 04 06) + punpckhbw mmC, mmF ; mmC=(10 12 14 16) + + movq mmD, mmB + punpcklbw mmB, mmF ; mmB=(01 03 05 07) + punpckhbw mmD, mmF ; mmD=(11 13 15 17) + + movq mmG, mmE + punpcklbw mmE, mmF ; mmE=(20 22 24 26) + punpckhbw mmG, mmF ; mmG=(30 32 34 36) + + punpcklbw mmF, mmH + punpckhbw mmH, mmH + psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6, mm1 + punpcklwd mm1, mm3 + punpckhwd mm6, mm3 + pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6, mm0 + punpcklwd mm0, mm2 + punpckhwd mm6, mm2 + pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4, mm0 + punpcklwd mm0, mm3 + punpckhwd mm4, mm3 + pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0, mm3 + paddd mm4, mm3 + psrld mm0, SCALEBITS ; mm0=YOL + psrld mm4, SCALEBITS ; mm4=YOH + packssdw mm0, mm4 ; mm0=YO + + movq mm4, mm6 + punpcklwd mm6, mm2 + punpckhwd mm4, mm2 + pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6, mm2 + paddd mm4, mm2 + psrld mm6, SCALEBITS ; mm6=YEL + psrld mm4, SCALEBITS ; mm4=YEH + packssdw mm6, mm4 ; mm6=YE + + psllw mm0, BYTE_BIT + por mm6, mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm new file mode 100644 index 0000000000..c9d6ff1e35 --- /dev/null +++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm @@ -0,0 +1,382 @@ +; +; jcgryext.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +%define img_width(b) (b) + 8 ; JDIMENSION img_width +%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf +%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf +%define output_row(b) (b) + 20 ; JDIMENSION output_row +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, byte [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, word [esi+ecx] + shl eax, WORD_BIT + or eax, edx +.column_ld4: + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm new file mode 100644 index 0000000000..d0112e6107 --- /dev/null +++ b/media/libjpeg/simd/i386/jchuff-sse2.asm @@ -0,0 +1,423 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (SSE2) +; +; Copyright (C) 2009-2011, 2014-2017, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_huff_encode_one_block) + EXTERN EXTN(jpeg_nbits_table) + +EXTN(jconst_huff_encode_one_block): + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov edx, put_buffer + mov ecx, put_bits + shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [eax], dl ; *buffer++ = c; + add eax, 1 + cmp dl, 0xFF ; need to stuff a zero byte? + jne %%.EMIT_BYTE_END + mov byte [eax], 0 ; *buffer++ = 0; + add eax, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF15 0 + cmp put_bits, 16 ; if (put_bits > 31) { + jl %%.CHECKBUF15_END + mov eax, POINTER [esp+buffer] + EMIT_BYTE + EMIT_BYTE + mov POINTER [esp+buffer], eax +%%.CHECKBUF15_END: +%endmacro + +%macro EMIT_BITS 1 + PUT_BITS %1 + CHECKBUF15 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); + pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); + pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); + pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. +; +; GLOBAL(JOCTET *) +; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; eax + 8 = working_state *state +; eax + 12 = JOCTET *buffer +; eax + 16 = JCOEFPTR block +; eax + 20 = int last_dc_val +; eax + 24 = c_derived_tbl *dctbl +; eax + 28 = c_derived_tbl *actbl + +%define pad 6 * SIZEOF_DWORD ; Align to 16 bytes +%define t1 pad +%define t2 t1 + (DCTSIZE2 * SIZEOF_WORD) +%define block t2 + (DCTSIZE2 * SIZEOF_WORD) +%define actbl block + SIZEOF_DWORD +%define buffer actbl + SIZEOF_DWORD +%define temp buffer + SIZEOF_DWORD +%define temp2 temp + SIZEOF_DWORD +%define temp3 temp2 + SIZEOF_DWORD +%define temp4 temp3 + SIZEOF_DWORD +%define temp5 temp4 + SIZEOF_DWORD +%define gotptr temp5 + SIZEOF_DWORD ; void *gotptr +%define put_buffer ebx +%define put_bits edi + + align 32 + GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, temp5+9*SIZEOF_DWORD-pad + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov esi, POINTER [eax+8] ; (working_state *state) + mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer; + mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits; + push esi ; esi is now scratch + + get_GOT edx ; get GOT address + movpic POINTER [esp+gotptr], edx ; save GOT address + + mov ecx, POINTER [eax+28] + mov edx, POINTER [eax+16] + mov esi, POINTER [eax+12] + mov POINTER [esp+actbl], ecx + mov POINTER [esp+block], edx + mov POINTER [esp+buffer], esi + + ; Encode the DC coefficient difference per section F.1.2.1 + mov esi, POINTER [esp+block] ; block + movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; + sub ecx, dword [eax+20] + mov esi, ecx + + ; This is a well-known technique for obtaining the absolute value + ; with out a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov edx, ecx + sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor ecx, edx ; temp ^= temp3; + sub ecx, edx ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add esi, edx ; temp2 += temp3; + mov dword [esp+temp], esi ; backup temp2 in temp + + ; Find the number of bits needed for the magnitude of the coefficient + movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) + movzx edx, byte [GOTOFF(ebp, EXTN(jpeg_nbits_table) + ecx)] ; nbits = JPEG_NBITS(temp); + mov dword [esp+temp2], edx ; backup nbits in temp2 + + ; Emit the Huffman-coded symbol for the number of bits + mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore + mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; + movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS eax ; EMIT_BITS(code, size) + + mov ecx, dword [esp+temp2] ; restore nbits + + ; Mask off any extra bits in code + mov eax, 1 + shl eax, cl + dec eax + and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1; + + ; Emit that number of bits of the value, if positive, + ; or the complement of its magnitude, if negative. + EMIT_BITS eax ; EMIT_BITS(temp2, nbits) + + ; Prepare data + xor ecx, ecx + mov esi, POINTER [esp+block] + kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \ + 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \ + 27, 20, 13, 6, 7, 14, 21, 28, 35, \ + xmm0, xmm1, xmm2, xmm3 + kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \ + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \ + 53, 60, 61, 54, 47, 55, 62, 63, 63, \ + xmm0, xmm1, xmm2, xmm3 + + pxor xmm7, xmm7 + movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea esi, [esp+t1] + mov ebp, POINTER [esp+actbl] ; ebp = actbl + +.BLOOP: + bsf ecx, edx ; r = __builtin_ctzl(index); + jz near .ELOOP + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov dword [esp+temp3], edx +.BRLOOP: + cmp ecx, 16 ; while (r > 15) { + jl near .ERLOOP + sub ecx, 16 ; r -= 16; + mov dword [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, dword [esp+temp] + jmp .BRLOOP +.ERLOOP: + movsx eax, word [esi] ; temp = t1[k]; + movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) + movzx eax, byte [GOTOFF(edx, EXTN(jpeg_nbits_table) + eax)] ; nbits = JPEG_NBITS(temp); + mov dword [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, dword [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1; + EMIT_BITS eax ; PUT_BITS(temp2, nbits) + mov edx, dword [esp+temp3] + add esi, 2 ; ++k; + shr edx, 1 ; index >>= 1; + + jmp .BLOOP +.ELOOP: + movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea eax, [esp + t1 + (DCTSIZE2/2) * 2] + sub eax, esi + shr eax, 1 + bsf ecx, edx ; r = __builtin_ctzl(index); + jz near .ELOOP2 + shr edx, cl ; index >>= r; + add ecx, eax + lea esi, [esi+ecx*2] ; k += r; + mov dword [esp+temp3], edx + jmp .BRLOOP2 +.BLOOP2: + bsf ecx, edx ; r = __builtin_ctzl(index); + jz near .ELOOP2 + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov dword [esp+temp3], edx +.BRLOOP2: + cmp ecx, 16 ; while (r > 15) { + jl near .ERLOOP2 + sub ecx, 16 ; r -= 16; + mov dword [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, dword [esp+temp] + jmp .BRLOOP2 +.ERLOOP2: + movsx eax, word [esi] ; temp = t1[k]; + bsr eax, eax ; nbits = 32 - __builtin_clz(temp); + inc eax + mov dword [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, dword [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1; + EMIT_BITS eax ; PUT_BITS(temp2, nbits) + mov edx, dword [esp+temp3] + add esi, 2 ; ++k; + shr edx, 1 ; index >>= 1; + + jmp .BLOOP2 +.ELOOP2: + ; If the last coef(s) were zero, emit an end-of-block code + lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp edx, esi ; if (r > 0) { + je .EFN + mov eax, INT [ebp] ; code = actbl->ehufco[0]; + movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS eax +.EFN: + mov eax, [esp+buffer] + pop esi + ; Save put_buffer & put_bits + mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; + mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits; + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm new file mode 100644 index 0000000000..8b73178376 --- /dev/null +++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm @@ -0,0 +1,660 @@ +; +; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2) +; +; Copyright (C) 2016, 2018, Matthieu Darbois +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation of data preparation for progressive +; Huffman encoding. See jcphuff.c for more details. + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; -------------------------------------------------------------------------- +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2() + +%macro LOAD16 0 + pxor N0, N0 + pxor N1, N1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + pinsrw X1, word [BLOCK + T1 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + pinsrw X1, word [BLOCK + T1 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + pinsrw X1, word [BLOCK + T1 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + pinsrw X1, word [BLOCK + T1 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + pinsrw X1, word [BLOCK + T1 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + pinsrw X1, word [BLOCK + T1 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + mov T1, INT [LUT + 15*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + pinsrw X1, word [BLOCK + T1 * 2], 7 +%endmacro + +%macro LOAD15 0 + pxor N0, N0 + pxor N1, N1 + pxor X1, X1 + + mov T0, INT [LUT + 0*SIZEOF_INT] + mov T1, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + + cmp LENEND, 2 + jl %%.ELOAD15 + mov T1, INT [LUT + 9*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD15 + mov T1, INT [LUT + 10*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD15 + mov T1, INT [LUT + 11*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD15 + mov T1, INT [LUT + 12*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD15 + mov T1, INT [LUT + 13*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD15 + mov T1, INT [LUT + 14*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 6 +%%.ELOAD15: +%endmacro + +%macro LOAD8 0 + pxor N0, N0 + + mov T0, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + + mov T0, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 +%endmacro + +%macro LOAD7 0 + pxor N0, N0 + pxor X0, X0 + + mov T1, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 0 + + cmp LENEND, 2 + jl %%.ELOAD7 + mov T1, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD7 + mov T1, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD7 + mov T1, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD7 + mov T1, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD7 + mov T1, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD7 + mov T1, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 6 +%%.ELOAD7: +%endmacro + +%macro REDUCE0 0 + movdqa xmm0, XMMWORD [VALUES + ( 0*2)] + movdqa xmm1, XMMWORD [VALUES + ( 8*2)] + movdqa xmm2, XMMWORD [VALUES + (16*2)] + movdqa xmm3, XMMWORD [VALUES + (24*2)] + movdqa xmm4, XMMWORD [VALUES + (32*2)] + movdqa xmm5, XMMWORD [VALUES + (40*2)] + movdqa xmm6, XMMWORD [VALUES + (48*2)] + + pcmpeqw xmm0, ZERO + pcmpeqw xmm1, ZERO + pcmpeqw xmm2, ZERO + pcmpeqw xmm3, ZERO + pcmpeqw xmm4, ZERO + pcmpeqw xmm5, ZERO + pcmpeqw xmm6, ZERO + pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] + + packsswb xmm0, xmm1 + packsswb xmm2, xmm3 + packsswb xmm4, xmm5 + packsswb xmm6, xmm7 + + pmovmskb eax, xmm0 + pmovmskb ecx, xmm2 + pmovmskb edx, xmm4 + pmovmskb esi, xmm6 + + shl ecx, 16 + shl esi, 16 + + or eax, ecx + or edx, esi + + not eax + not edx + + mov edi, ZEROBITS + + mov INT [edi], eax + mov INT [edi+SIZEOF_INT], edx +%endmacro + +; +; Prepare data for jsimd_encode_mcu_AC_first(). +; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *zerobits + +%define ZERO xmm7 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define LEN ebp + +%define ZEROBITS INT [esp + 5 * 4] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 4 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov ZEROBITS, T0 + mov LEN, INT [eax + 16] + pxor ZERO, ZERO + mov K, LEN + and K, -16 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 + test LEN, 15 + je .PADDING +.ELOOP16: + mov LENEND, LEN + and LENEND, 7 + + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN + +; +; Prepare data for jsimd_encode_mcu_AC_refine(). +; +; GLOBAL(int) +; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *absvalues, +; size_t *bits) +; +; eax + 8 = const JCOEF *block +; eax + 12 = const int *jpeg_natural_order_start +; eax + 16 = int Sl +; eax + 20 = int Al +; eax + 24 = JCOEF *values +; eax + 28 = size_t *bits + +%define ZERO xmm7 +%define ONE xmm5 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LENEND eax +%define LUT ebx +%define T0 ecx +%define T0w cx +%define T1 edx +%define BLOCK esi +%define VALUES edi +%define KK ebp + +%define ZEROBITS INT [esp + 5 * 4] +%define EOB INT [esp + 5 * 4 + 4] +%define LEN INT [esp + 5 * 4 + 8] + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + sub esp, 16 + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + pcmpeqw ONE, ONE + psrlw ONE, 15 + mov BLOCK, INT [eax + 8] + mov LUT, INT [eax + 12] + mov VALUES, INT [eax + 24] + movd AL, INT [eax + 20] + mov T0, INT [eax + 28] + mov K, INT [eax + 16] + mov INT [T0 + 2 * SIZEOF_INT], -1 + mov INT [T0 + 3 * SIZEOF_INT], -1 + mov ZEROBITS, T0 + mov LEN, K + pxor ZERO, ZERO + and K, -16 + mov EOB, 0 + xor KK, KK + shr K, 4 + jz .ELOOPR16 +.BLOOPR16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER16 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER16: + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + add KK, 2 + dec K + jnz .BLOOPR16 +.ELOOPR16: + mov LENEND, LEN + + test LENEND, 8 + jz .TRYR7 + test LENEND, 7 + jz .TRYR8 + + and LENEND, 7 + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER15 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER15: + add VALUES, 16*2 + jmp .PADDINGR +.TRYR8: + LOAD8 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER8 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER8: + add VALUES, 8*2 + jmp .PADDINGR +.TRYR7: + and LENEND, 7 + LOAD7 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + mov T1, ZEROBITS + not T0 + mov word [T1 + 2 * SIZEOF_INT + KK], T0w + pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); + bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER7 ; if (idx) { + lea T1, [T1+KK*8] + mov EOB, T1 ; EOB = k + idx; +.CONTINUER7: + add VALUES, 8*2 +.PADDINGR: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDINGR + align 16 +.ZEROLOOPR: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOPR +.EPADDINGR: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + mov eax, EOB + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +%undef ZERO +%undef ONE +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef KK +%undef EOB +%undef SIGN +%undef LUT +%undef T0 +%undef T1 +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm new file mode 100644 index 0000000000..0a20802dd8 --- /dev/null +++ b/media/libjpeg/simd/i386/jcsample-avx2.asm @@ -0,0 +1,388 @@ +; +; jcsample.asm - downsampling (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2) + +EXTN(jsimd_h2v1_downsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + vmovd xmm7, edx + vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r24: + ; ecx can possibly be 8, 16, 24 + cmp ecx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD] + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp ecx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD] + +.downsample: + vpsrlw ymm2, ymm0, BYTE_BIT + vpand ymm0, ymm0, ymm6 + vpsrlw ymm3, ymm1, BYTE_BIT + vpand ymm1, ymm1, ymm6 + + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm1, ymm1, 1 + + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + + sub ecx, byte SIZEOF_YMMWORD ; outcol + add esi, byte 2*SIZEOF_YMMWORD ; inptr + add edi, byte 1*SIZEOF_YMMWORD ; outptr + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + test ecx, ecx + jnz near .columnloop_r24 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2) + +EXTN(jsimd_h2v2_downsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + vmovd xmm7, edx + vpcmpeqw ymm6, ymm6, ymm6 + vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2} + vperm2i128 ymm7, ymm7, ymm7, 0 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_YMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r24: + cmp ecx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD] + vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD] + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp ecx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov ecx, SIZEOF_YMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] + vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD] + vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD] + +.downsample: + vpand ymm4, ymm0, ymm6 + vpsrlw ymm0, ymm0, BYTE_BIT + vpand ymm5, ymm1, ymm6 + vpsrlw ymm1, ymm1, BYTE_BIT + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + + vpand ymm4, ymm2, ymm6 + vpsrlw ymm2, ymm2, BYTE_BIT + vpand ymm5, ymm3, ymm6 + vpsrlw ymm3, ymm3, BYTE_BIT + vpaddw ymm2, ymm2, ymm4 + vpaddw ymm3, ymm3, ymm5 + + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpsrlw ymm0, ymm0, 2 + vpsrlw ymm2, ymm2, 2 + + vpackuswb ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + + sub ecx, byte SIZEOF_YMMWORD ; outcol + add edx, byte 2*SIZEOF_YMMWORD ; inptr0 + add esi, byte 2*SIZEOF_YMMWORD ; inptr1 + add edi, byte 1*SIZEOF_YMMWORD ; outptr + cmp ecx, byte SIZEOF_YMMWORD + jae near .columnloop + test ecx, ecx + jnz near .columnloop_r24 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm new file mode 100644 index 0000000000..2c223eebe8 --- /dev/null +++ b/media/libjpeg/simd/i386/jcsample-mmx.asm @@ -0,0 +1,324 @@ +; +; jcsample.asm - downsampling (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx) + +EXTN(jsimd_h2v1_downsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd mm7, edx + pcmpeqw mm6, mm6 + punpckldq mm7, mm7 ; mm7={0, 1, 0, 1} + psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2, mm0 + movq mm3, mm1 + + pand mm0, mm6 + psrlw mm2, BYTE_BIT + pand mm1, mm6 + psrlw mm3, BYTE_BIT + + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm7 + paddw mm1, mm7 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz short .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx) + +EXTN(jsimd_h2v2_downsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd mm7, edx + pcmpeqw mm6, mm6 + punpckldq mm7, mm7 ; mm7={1, 2, 1, 2} + psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] + movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm4, mm0 + movq mm5, mm1 + pand mm0, mm6 + psrlw mm4, BYTE_BIT + pand mm1, mm6 + psrlw mm5, BYTE_BIT + paddw mm0, mm4 + paddw mm1, mm5 + + movq mm4, mm2 + movq mm5, mm3 + pand mm2, mm6 + psrlw mm4, BYTE_BIT + pand mm3, mm6 + psrlw mm5, BYTE_BIT + paddw mm2, mm4 + paddw mm3, mm5 + + paddw mm0, mm1 + paddw mm2, mm3 + paddw mm0, mm7 + paddw mm2, mm7 + psrlw mm0, 2 + psrlw mm2, 2 + + packuswb mm0, mm2 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add edx, byte 2*SIZEOF_MMWORD ; inptr0 + add esi, byte 2*SIZEOF_MMWORD ; inptr1 + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz near .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm new file mode 100644 index 0000000000..4fea60d2e2 --- /dev/null +++ b/media/libjpeg/simd/i386/jcsample-sse2.asm @@ -0,0 +1,351 @@ +; +; jcsample.asm - downsampling (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx, ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +%define img_width(b) (b) + 8 ; JDIMENSION image_width +%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor +%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks +%define input_data(b) (b) + 24 ; JSAMPARRAY input_data +%define output_data(b) (b) + 28 ; JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 + +.columnloop: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm new file mode 100644 index 0000000000..015be0416c --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm @@ -0,0 +1,515 @@ +; +; jdcolext.asm - colorspace conversion (AVX2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2) + +EXTN(jsimd_ycc_rgb_convert_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 +.columnloop: + + vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm0, ymm0, ymm0 + vpcmpeqw ymm7, ymm7, ymm7 + vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..} + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO + vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE + vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO + + vpaddw ymm2, ymm4, ymm7 + vpaddw ymm3, ymm5, ymm7 + vpaddw ymm6, ymm0, ymm7 + vpaddw ymm7, ymm1, ymm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE + vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO + vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE + vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO + + vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800)) + vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800)) + vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200)) + vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)] + vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)] + vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800)) + vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800)) + vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)] + vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)] + vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200)) + vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm5, ymm5, ymm3 + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E + vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O + vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E + vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O + + vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E + vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O + + vpunpckhwd ymm4, ymm2, ymm6 + vpunpcklwd ymm2, ymm2, ymm6 + vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)] + vpunpckhwd ymm5, ymm3, ymm7 + vpunpcklwd ymm3, ymm3, ymm7 + vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)] + + vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm4, ymm4, SCALEBITS + vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm3, ymm3, SCALEBITS + vpsrad ymm5, ymm5, SCALEBITS + + vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm4, ymm4, ymm4 + vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..} + vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO + + vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr0 + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub ecx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH +.out0: + add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr0 + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + cmp ecx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub ecx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp ecx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + vperm2i128 ymmA, ymmA, ymmA, 1 + add edi, byte SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [edi], xmmA + add edi, byte SIZEOF_YMMWORD/16*4 + sub ecx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + vmovd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm new file mode 100644 index 0000000000..5813cfcb66 --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm @@ -0,0 +1,404 @@ +; +; jdcolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx) + +EXTN(jsimd_ycc_rgb_convert_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 +.columnloop: + + movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) + movq mm1, MMWORD [edx] ; mm1=Cr(01234567) + + pcmpeqw mm4, mm4 + pcmpeqw mm7, mm7 + psrlw mm4, BYTE_BIT + psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} + + pand mm4, mm5 ; mm4=Cb(0246)=CbE + psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO + pand mm0, mm1 ; mm0=Cr(0246)=CrE + psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO + + paddw mm4, mm7 + paddw mm5, mm7 + paddw mm0, mm7 + paddw mm1, mm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm2, mm4 ; mm2=CbE + movq mm3, mm5 ; mm3=CbO + paddw mm4, mm4 ; mm4=2*CbE + paddw mm5, mm5 ; mm5=2*CbO + movq mm6, mm0 ; mm6=CrE + movq mm7, mm1 ; mm7=CrO + paddw mm0, mm0 ; mm0=2*CrE + paddw mm1, mm1 ; mm1=2*CrO + + pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) + pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) + pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) + pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) + + paddw mm4, [GOTOFF(eax,PW_ONE)] + paddw mm5, [GOTOFF(eax,PW_ONE)] + psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800)) + psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800)) + paddw mm0, [GOTOFF(eax,PW_ONE)] + paddw mm1, [GOTOFF(eax,PW_ONE)] + psraw mm0, 1 ; mm0=(CrE * FIX(0.40200)) + psraw mm1, 1 ; mm1=(CrO * FIX(0.40200)) + + paddw mm4, mm2 + paddw mm5, mm3 + paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E + paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O + paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E + paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O + + movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E + movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O + + movq mm4, mm2 + movq mm5, mm3 + punpcklwd mm2, mm6 + punpckhwd mm4, mm6 + pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm3, mm7 + punpckhwd mm5, mm7 + pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm2, [GOTOFF(eax,PD_ONEHALF)] + paddd mm4, [GOTOFF(eax,PD_ONEHALF)] + psrad mm2, SCALEBITS + psrad mm4, SCALEBITS + paddd mm3, [GOTOFF(eax,PD_ONEHALF)] + paddd mm5, [GOTOFF(eax,PD_ONEHALF)] + psrad mm3, SCALEBITS + psrad mm5, SCALEBITS + + packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movq mm5, MMWORD [esi] ; mm5=Y(01234567) + + pcmpeqw mm4, mm4 + psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} + pand mm4, mm5 ; mm4=Y(0246)=YE + psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO + + paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) + paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) + packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) + paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) + packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) + paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) + packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG, mmA + movq mmH, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC, mmD + movq mmB, mmD + punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF, mmE + punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16, 7 + +.column_st16: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA, mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 +.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax, mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov dword [edi+0*SIZEOF_DWORD], eax + psrlq mmA, DWORD_BIT + movd eax, mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi+0*SIZEOF_WORD], ax + shr eax, WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .nextrow + mov byte [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG, mmB + punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD, mmA + punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH, mmC + punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16, 7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA, mmC + movq mmD, mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .nextrow + movd dword [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm new file mode 100644 index 0000000000..d5572b3294 --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm @@ -0,0 +1,458 @@ +; +; jdcolext.asm - colorspace conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +%define out_width(b) (b) + 8 ; JDIMENSION out_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define input_row(b) (b) + 16 ; JDIMENSION input_row +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf +%define num_rows(b) (b) + 24 ; int num_rows + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 +.columnloop: + + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [GOTOFF(eax,PW_ONE)] + paddw xmm5, [GOTOFF(eax,PW_ONE)] + psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [GOTOFF(eax,PW_ONE)] + paddw xmm1, [GOTOFF(eax,PW_ONE)] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm4, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16, 7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm new file mode 100644 index 0000000000..e05b60d001 --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm @@ -0,0 +1,118 @@ +; +; jdcolor.asm - colorspace conversion (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) + +EXTN(jconst_ycc_rgb_convert_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2 +%include "jdcolext-avx2.asm" diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm new file mode 100644 index 0000000000..fb7e7bcce4 --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm @@ -0,0 +1,117 @@ +; +; jdcolor.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_mmx) + +EXTN(jconst_ycc_rgb_convert_mmx): + +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx +%include "jdcolext-mmx.asm" diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm new file mode 100644 index 0000000000..b736255317 --- /dev/null +++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm @@ -0,0 +1,117 @@ +; +; jdcolor.asm - colorspace conversion (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2.asm" diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm new file mode 100644 index 0000000000..711e6792d0 --- /dev/null +++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm @@ -0,0 +1,136 @@ +; +; jdmerge.asm - merged upsampling/color conversion (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_avx2) + +EXTN(jconst_merged_upsample_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgbx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgbx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgrx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgrx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm new file mode 100644 index 0000000000..6e8311d408 --- /dev/null +++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm @@ -0,0 +1,123 @@ +; +; jdmerge.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_mmx) + +EXTN(jconst_merged_upsample_mmx): + +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx +%include "jdmrgext-mmx.asm" diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm new file mode 100644 index 0000000000..e32f90aa17 --- /dev/null +++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm @@ -0,0 +1,135 @@ +; +; jdmerge.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_sse2) + +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm new file mode 100644 index 0000000000..e35f7282bc --- /dev/null +++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm @@ -0,0 +1,575 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (AVX2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2) + +EXTN(jsimd_h2v1_merged_upsample_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpcmpeqw ymm3, ymm3, ymm3 + vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL + vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH + vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL + vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH + + vpaddw ymm5, ymm6, ymm3 + vpaddw ymm2, ymm4, ymm3 + vpaddw ymm1, ymm7, ymm3 + vpaddw ymm3, ymm0, ymm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL + vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH + vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL + + vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800)) + vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800)) + vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200)) + vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)] + vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)] + vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800)) + vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800)) + vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)] + vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)] + vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200)) + vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, ymm5 + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L + vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H + vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L + + vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H + vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H + + vpunpckhwd ymm6, ymm5, ymm1 + vpunpcklwd ymm5, ymm5, ymm1 + vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)] + vpunpckhwd ymm7, ymm2, ymm3 + vpunpcklwd ymm2, ymm2, ymm3 + vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)] + vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)] + + vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm5, ymm5, SCALEBITS + vpsrad ymm6, ymm6, SCALEBITS + vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)] + vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm7, ymm7, SCALEBITS + + vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 + +.Yloop_2nd: + vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H + vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H + vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H + alignx 16, 7 + +.Yloop_1st: + vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO + + vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H) + vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H) + vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H) + + vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_YMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub ecx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp ecx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test edi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH +.out0: + add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub ecx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_YMMWORD ; inptr0 + dec al + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_YMMWORD ; inptr1 + add edx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st64: + cmp ecx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD + add edi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub ecx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp ecx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA + add edi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub ecx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp ecx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub ecx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [edi], xmmA + add edi, byte SIZEOF_YMMWORD/16*4 + sub ecx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + vmovd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2) + +EXTN(jsimd_h2v2_merged_upsample_avx2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx, esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_avx2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_avx2) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm new file mode 100644 index 0000000000..eb3e36b475 --- /dev/null +++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm @@ -0,0 +1,460 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx) + +EXTN(jsimd_h2v1_merged_upsample_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) + movq mm7, MMWORD [edx] ; mm7=Cr(01234567) + + pxor mm1, mm1 ; mm1=(all 0's) + pcmpeqw mm3, mm3 + psllw mm3, 7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80} + + movq mm4, mm6 + punpckhbw mm6, mm1 ; mm6=Cb(4567)=CbH + punpcklbw mm4, mm1 ; mm4=Cb(0123)=CbL + movq mm0, mm7 + punpckhbw mm7, mm1 ; mm7=Cr(4567)=CrH + punpcklbw mm0, mm1 ; mm0=Cr(0123)=CrL + + paddw mm6, mm3 + paddw mm4, mm3 + paddw mm7, mm3 + paddw mm0, mm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm5, mm6 ; mm5=CbH + movq mm2, mm4 ; mm2=CbL + paddw mm6, mm6 ; mm6=2*CbH + paddw mm4, mm4 ; mm4=2*CbL + movq mm1, mm7 ; mm1=CrH + movq mm3, mm0 ; mm3=CrL + paddw mm7, mm7 ; mm7=2*CrH + paddw mm0, mm0 ; mm0=2*CrL + + pmulhw mm6, [GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800)) + pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800)) + pmulhw mm7, [GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200)) + pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200)) + + paddw mm6, [GOTOFF(eax,PW_ONE)] + paddw mm4, [GOTOFF(eax,PW_ONE)] + psraw mm6, 1 ; mm6=(CbH * -FIX(0.22800)) + psraw mm4, 1 ; mm4=(CbL * -FIX(0.22800)) + paddw mm7, [GOTOFF(eax,PW_ONE)] + paddw mm0, [GOTOFF(eax,PW_ONE)] + psraw mm7, 1 ; mm7=(CrH * FIX(0.40200)) + psraw mm0, 1 ; mm0=(CrL * FIX(0.40200)) + + paddw mm6, mm5 + paddw mm4, mm2 + paddw mm6, mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H + paddw mm4, mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L + paddw mm7, mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H + paddw mm0, mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L + + movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H + movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H + + movq mm6, mm5 + movq mm7, mm2 + punpcklwd mm5, mm1 + punpckhwd mm6, mm1 + pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm6, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm2, mm3 + punpckhwd mm7, mm3 + pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm7, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm5, [GOTOFF(eax,PD_ONEHALF)] + paddd mm6, [GOTOFF(eax,PD_ONEHALF)] + psrad mm5, SCALEBITS + psrad mm6, SCALEBITS + paddd mm2, [GOTOFF(eax,PD_ONEHALF)] + paddd mm7, [GOTOFF(eax,PD_ONEHALF)] + psrad mm2, SCALEBITS + psrad mm7, SCALEBITS + + packssdw mm5, mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw mm2, mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw mm5, mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw mm2, mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 + +.Yloop_2nd: + movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H + movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H + movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H + alignx 16, 7 + +.Yloop_1st: + movq mm7, MMWORD [esi] ; mm7=Y(01234567) + + pcmpeqw mm6, mm6 + psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + pand mm6, mm7 ; mm6=Y(0246)=YE + psrlw mm7, BYTE_BIT ; mm7=Y(1357)=YO + + movq mm1, mm0 ; mm1=mm0=(R-Y)(L/H) + movq mm3, mm2 ; mm3=mm2=(G-Y)(L/H) + movq mm5, mm4 ; mm5=mm4=(B-Y)(L/H) + + paddw mm0, mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6) + paddw mm1, mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7) + packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2, mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6) + paddw mm3, mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7) + packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6) + paddw mm5, mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7) + packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG, mmA + movq mmH, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC, mmD + movq mmB, mmD + punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF, mmE + punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz near .endcolumn + + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st16: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA, mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 +.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax, mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov dword [edi+0*SIZEOF_DWORD], eax + psrlq mmA, DWORD_BIT + movd eax, mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi+0*SIZEOF_WORD], ax + shr eax, WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .endcolumn + mov byte [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC, mmA + punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG, mmB + punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD, mmA + punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH, mmC + punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .endcolumn + + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA, mmC + movq mmD, mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA, mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .endcolumn + movd dword [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx) + +EXTN(jsimd_h2v2_merged_upsample_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, JDIMENSION [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx, esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm new file mode 100644 index 0000000000..c113dc4d27 --- /dev/null +++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm @@ -0,0 +1,517 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [GOTOFF(eax,PW_ONE)] + paddw xmm4, [GOTOFF(eax,PW_ONE)] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [GOTOFF(eax,PW_ONE)] + paddw xmm0, [GOTOFF(eax,PW_ONE)] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, xmm3 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm6, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm7, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16, 7 + +.Yloop_1st: + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov word [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov byte [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b) + 8 ; JDIMENSION output_width +%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx, esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm new file mode 100644 index 0000000000..a800c35e08 --- /dev/null +++ b/media/libjpeg/simd/i386/jdsample-avx2.asm @@ -0,0 +1,760 @@ +; +; jdsample.asm - upsampling (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_avx2) + +EXTN(jconst_fancy_upsample_avx2): + +PW_ONE times 16 dw 1 +PW_TWO times 16 dw 2 +PW_THREE times 16 dw 3 +PW_SEVEN times 16 dw 7 +PW_EIGHT times 16 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2) + +EXTN(jsimd_h2v1_fancy_upsample_avx2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_YMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + vpcmpeqb xmm7, xmm7, xmm7 + vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff + vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD] + + add eax, byte SIZEOF_YMMWORD-1 + and eax, byte -SIZEOF_YMMWORD + cmp eax, byte SIZEOF_YMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + vpcmpeqb xmm6, xmm6, xmm6 + vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1) + vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff + vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] + jmp short .upsample + alignx 16, 7 + +.columnloop: + vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] + vperm2i128 ymm6, ymm0, ymm6, 0x20 + vpslldq ymm6, ymm6, 15 + +.upsample: + vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31) + + vperm2i128 ymm2, ymm0, ymm1, 0x20 + vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30) + vperm2i128 ymm4, ymm0, ymm1, 0x03 + vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --) + + vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30) + vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32) + + vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --) + + vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30) + vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22) + vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24) + vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32) + vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + + vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)] + vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)] + vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)] + + vpaddw ymm2, ymm2, ymm1 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm3, ymm3, ymm1 + vpaddw ymm6, ymm6, ymm4 + vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm3, ymm3, BYTE_BIT + vpsllw ymm6, ymm6, BYTE_BIT + vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5 + + sub eax, byte SIZEOF_YMMWORD + add esi, byte 1*SIZEOF_YMMWORD ; inptr + add edi, byte 2*SIZEOF_YMMWORD ; outptr + cmp eax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2) + +EXTN(jsimd_h2v2_fancy_upsample_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_YMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0] + vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] + vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) + + vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + + vpcmpeqb xmm7, xmm7, xmm7 + vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6 + + vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + + vmovdqa YMMWORD [wk(0)], ymm1 + vmovdqa YMMWORD [wk(1)], ymm2 + + poppic ebx + + add eax, byte SIZEOF_YMMWORD-1 + and eax, byte -SIZEOF_YMMWORD + cmp eax, byte SIZEOF_YMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpcmpeqb xmm1, xmm1, xmm1 + vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2) + vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff + + vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD] + vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD] + + vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + + jmp near .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1] + vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] + vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) + + vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6 + + vperm2i128 ymm1, ymm3, ymm1, 0x20 + vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + vperm2i128 ymm2, ymm3, ymm2, 0x20 + vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + + vmovdqa YMMWORD [wk(2)], ymm1 + vmovdqa YMMWORD [wk(3)], ymm2 + +.upsample: + ; -- process the upper row + + vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + + vperm2i128 ymm0, ymm1, ymm7, 0x03 + vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm4, ymm1, ymm3, 0x20 + vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm5, ymm1, ymm7, 0x03 + vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm6, ymm1, ymm3, 0x20 + vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm2, ymm1, ymm3, 0x03 + vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm4, ymm1, ymm3, 0x03 + vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm1, ymm7, 0x20 + vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(0)], ymm4 + + vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)] + vpaddw ymm2, [GOTOFF(ebx,PW_SEVEN)] + + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm5, ymm5, ymm3 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm0, ymm0, BYTE_BIT + vpsllw ymm2, ymm2, BYTE_BIT + vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 + + ; -- process the lower row + + vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + + vperm2i128 ymm7, ymm1, ymm6, 0x03 + vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm3, ymm1, ymm4, 0x20 + vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm0, ymm1, ymm6, 0x03 + vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm2, ymm1, ymm4, 0x20 + vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm5, ymm1, ymm4, 0x03 + vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm3, ymm1, ymm4, 0x03 + vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm1, ymm6, 0x20 + vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(1)], ymm3 + + vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)] + vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)] + vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)] + vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)] + vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)] + + vpaddw ymm1, ymm1, ymm6 + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm7, ymm7, ymm6 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm7, ymm7, BYTE_BIT + vpsllw ymm5, ymm5, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31) + vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0 + + poppic ebx + + sub eax, byte SIZEOF_YMMWORD + add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_YMMWORD ; inptr0 + add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_YMMWORD ; outptr0 + add edi, byte 2*SIZEOF_YMMWORD ; outptr1 + cmp eax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2) + +EXTN(jsimd_h2v1_upsample_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (SIZEOF_YMMWORD-1) + and edx, -SIZEOF_YMMWORD + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + cmp eax, byte SIZEOF_YMMWORD + ja near .above_16 + + vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + jmp short .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1 + + sub eax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr + add edi, byte 2*SIZEOF_YMMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2) + +EXTN(jsimd_h2v2_upsample_avx2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (SIZEOF_YMMWORD-1) + and edx, -SIZEOF_YMMWORD + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + cmp eax, byte SIZEOF_YMMWORD + ja short .above_16 + + vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + jmp near .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1 + + sub eax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add esi, byte SIZEOF_YMMWORD ; inptr + add ebx, 2*SIZEOF_YMMWORD ; outptr0 + add edi, 2*SIZEOF_YMMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm new file mode 100644 index 0000000000..12c49f0eab --- /dev/null +++ b/media/libjpeg/simd/i386/jdsample-mmx.asm @@ -0,0 +1,731 @@ +; +; jdsample.asm - upsampling (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_mmx) + +EXTN(jconst_fancy_upsample_mmx): + +PW_ONE times 4 dw 1 +PW_TWO times 4 dw 2 +PW_THREE times 4 dw 3 +PW_SEVEN times 4 dw 7 +PW_EIGHT times 4 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx) + +EXTN(jsimd_h2v1_fancy_upsample_mmx): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_MMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor mm0, mm0 ; mm0=(all 0's) + pcmpeqb mm7, mm7 + psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT + pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + pcmpeqb mm6, mm6 + psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT + pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] + jmp short .upsample + alignx 16, 7 + +.columnloop: + movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] + psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT + +.upsample: + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, mm1 + movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7) + psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) + psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) + + por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6) + por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8) + + movq mm7, mm1 + psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) + + movq mm4, mm1 + punpcklbw mm1, mm0 ; mm1=( 0 1 2 3) + punpckhbw mm4, mm0 ; mm4=( 4 5 6 7) + movq mm5, mm2 + punpcklbw mm2, mm0 ; mm2=(-1 0 1 2) + punpckhbw mm5, mm0 ; mm5=( 3 4 5 6) + movq mm6, mm3 + punpcklbw mm3, mm0 ; mm3=( 1 2 3 4) + punpckhbw mm6, mm0 ; mm6=( 5 6 7 8) + + pmullw mm1, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + paddw mm2, [GOTOFF(ebx,PW_ONE)] + paddw mm5, [GOTOFF(ebx,PW_ONE)] + paddw mm3, [GOTOFF(ebx,PW_TWO)] + paddw mm6, [GOTOFF(ebx,PW_TWO)] + + paddw mm2, mm1 + paddw mm5, mm4 + psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6) + psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14) + paddw mm3, mm1 + paddw mm6, mm4 + psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7) + psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15) + + psllw mm3, BYTE_BIT + psllw mm6, BYTE_BIT + por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) + por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 + + sub eax, byte SIZEOF_MMWORD + add esi, byte 1*SIZEOF_MMWORD ; inptr + add edi, byte 2*SIZEOF_MMWORD ; outptr + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx) + +EXTN(jsimd_h2v2_fancy_upsample_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_MMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] + movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] + movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3, mm3 ; mm3=(all 0's) + movq mm4, mm0 + punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3) + punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7) + movq mm5, mm1 + punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3) + punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7) + movq mm6, mm2 + punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3) + punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7) + + pmullw mm0, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + + pcmpeqb mm7, mm7 + psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT + + paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 + + pand mm1, mm7 ; mm1=( 0 - - -) + pand mm2, mm7 ; mm2=( 0 - - -) + + movq MMWORD [wk(0)], mm1 + movq MMWORD [wk(1)], mm2 + + poppic ebx + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb mm1, mm1 + psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT + movq mm2, mm1 + + pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) + pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + + jmp short .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] + movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3, mm3 ; mm3=(all 0's) + movq mm4, mm0 + punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3) + punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7) + movq mm5, mm1 + punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3) + punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7) + movq mm6, mm2 + punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3) + punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7) + + pmullw mm0, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + + paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 + + psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) + psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + +.upsample: + ; -- process the upper row + + movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) + movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) + + movq mm0, mm7 + movq mm4, mm3 + psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -) + psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) + movq mm5, mm7 + movq mm6, mm3 + psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) + psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6) + + por mm0, mm4 ; mm0=( 1 2 3 4) + por mm5, mm6 ; mm5=( 3 4 5 6) + + movq mm1, mm7 + movq mm2, mm3 + psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -) + movq mm4, mm3 + psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) + + por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) + por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) + + movq MMWORD [wk(0)], mm4 + + pmullw mm7, [GOTOFF(ebx,PW_THREE)] + pmullw mm3, [GOTOFF(ebx,PW_THREE)] + paddw mm1, [GOTOFF(ebx,PW_EIGHT)] + paddw mm5, [GOTOFF(ebx,PW_EIGHT)] + paddw mm0, [GOTOFF(ebx,PW_SEVEN)] + paddw mm2, [GOTOFF(ebx,PW_SEVEN)] + + paddw mm1, mm7 + paddw mm5, mm3 + psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6) + psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14) + paddw mm0, mm7 + paddw mm2, mm3 + psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7) + psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15) + + psllw mm0, BYTE_BIT + psllw mm2, BYTE_BIT + por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) + por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 + + ; -- process the lower row + + movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) + movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) + + movq mm7, mm6 + movq mm3, mm4 + psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -) + psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) + movq mm0, mm6 + movq mm2, mm4 + psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) + psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6) + + por mm7, mm3 ; mm7=( 1 2 3 4) + por mm0, mm2 ; mm0=( 3 4 5 6) + + movq mm1, mm6 + movq mm5, mm4 + psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -) + movq mm3, mm4 + psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) + + por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) + por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) + + movq MMWORD [wk(1)], mm3 + + pmullw mm6, [GOTOFF(ebx,PW_THREE)] + pmullw mm4, [GOTOFF(ebx,PW_THREE)] + paddw mm1, [GOTOFF(ebx,PW_EIGHT)] + paddw mm0, [GOTOFF(ebx,PW_EIGHT)] + paddw mm7, [GOTOFF(ebx,PW_SEVEN)] + paddw mm5, [GOTOFF(ebx,PW_SEVEN)] + + paddw mm1, mm6 + paddw mm0, mm4 + psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6) + psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14) + paddw mm7, mm6 + paddw mm5, mm4 + psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7) + psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15) + + psllw mm7, BYTE_BIT + psllw mm5, BYTE_BIT + por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) + por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 + + poppic ebx + + sub eax, byte SIZEOF_MMWORD + add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_MMWORD ; inptr0 + add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_MMWORD ; outptr0 + add edi, byte 2*SIZEOF_MMWORD ; outptr1 + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx) + +EXTN(jsimd_h2v1_upsample_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1, mm0 + punpcklbw mm0, mm0 + punpckhbw mm1, mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm3, mm2 + punpcklbw mm2, mm2 + punpckhbw mm3, mm3 + + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 4*SIZEOF_MMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx) + +EXTN(jsimd_h2v2_upsample_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1, mm0 + punpcklbw mm0, mm0 + punpckhbw mm1, mm1 + + movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 + movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm3, mm2 + punpcklbw mm2, mm2 + punpckhbw mm3, mm3 + + movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 + movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add ebx, byte 4*SIZEOF_MMWORD ; outptr0 + add edi, byte 4*SIZEOF_MMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm new file mode 100644 index 0000000000..4e28d2f4b8 --- /dev/null +++ b/media/libjpeg/simd/i386/jdsample-sse2.asm @@ -0,0 +1,724 @@ +; +; jdsample.asm - upsampling (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16, 7 + +.columnloop: + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm2, [GOTOFF(ebx,PW_ONE)] + paddw xmm5, [GOTOFF(ebx,PW_ONE)] + paddw xmm3, [GOTOFF(ebx,PW_TWO)] + paddw xmm6, [GOTOFF(ebx,PW_TWO)] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + poppic ebx + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb xmm1, xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 + + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + alignx 16, 7 + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [GOTOFF(ebx,PW_THREE)] + pmullw xmm3, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm5, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm2, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm7, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm5, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor +%define output_width(b) (b) + 12 ; JDIMENSION output_width +%define input_data(b) (b) + 16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm new file mode 100644 index 0000000000..322ab16325 --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm @@ -0,0 +1,318 @@ +; +; jfdctflt.asm - floating-point FDCT (3DNow!) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_3dnow) + +EXTN(jconst_fdct_float_3dnow): + +PD_0_382 times 2 dd 0.382683432365089771728460 +PD_0_707 times 2 dd 0.707106781186547524400844 +PD_0_541 times 2 dd 0.541196100146196984399723 +PD_1_306 times 2 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_3dnow(FAST_FLOAT *data) +; + +%define data(b) (b) + 8 ; FAST_FLOAT *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_3dnow) + +EXTN(jsimd_fdct_float_3dnow): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) + + movq mm4, mm0 ; transpose coefficients + punpckldq mm0, mm1 ; mm0=(00 10)=data0 + punpckhdq mm4, mm1 ; mm4=(01 11)=data1 + movq mm5, mm2 ; transpose coefficients + punpckldq mm2, mm3 ; mm2=(06 16)=data6 + punpckhdq mm5, mm3 ; mm5=(07 17)=data7 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm2 ; mm4=data1-data6=tmp6 + pfsub mm0, mm5 ; mm0=data0-data7=tmp7 + pfadd mm6, mm2 ; mm6=data1+data6=tmp1 + pfadd mm7, mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4, mm1 ; transpose coefficients + punpckldq mm1, mm3 ; mm1=(02 12)=data2 + punpckhdq mm4, mm3 ; mm4=(03 13)=data3 + movq mm0, mm2 ; transpose coefficients + punpckldq mm2, mm5 ; mm2=(04 14)=data4 + punpckhdq mm0, mm5 ; mm0=(05 15)=data5 + + movq mm3, mm4 + movq mm5, mm1 + pfadd mm4, mm2 ; mm4=data3+data4=tmp3 + pfadd mm1, mm0 ; mm1=data2+data5=tmp2 + pfsub mm3, mm2 ; mm3=data3-data4=tmp4 + pfsub mm5, mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2, mm7 + movq mm0, mm6 + pfsub mm7, mm4 ; mm7=tmp13 + pfsub mm6, mm1 ; mm6=tmp12 + pfadd mm2, mm4 ; mm2=tmp10 + pfadd mm0, mm1 ; mm0=tmp11 + + pfadd mm6, mm7 + pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4, mm2 + movq mm1, mm7 + pfsub mm2, mm0 ; mm2=data4 + pfsub mm7, mm6 ; mm7=data6 + pfadd mm4, mm0 ; mm4=data0 + pfadd mm1, mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3, mm5 ; mm3=tmp10 + pfadd mm5, mm0 ; mm5=tmp11 + pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2, mm3 ; mm2=tmp10 + pfsub mm3, mm0 + pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2, mm3 ; mm2=z2 + pfadd mm0, mm3 ; mm0=z4 + + movq mm7, mm6 + pfsub mm6, mm5 ; mm6=z13 + pfadd mm7, mm5 ; mm7=z11 + + movq mm4, mm6 + movq mm1, mm7 + pfsub mm6, mm2 ; mm6=data3 + pfsub mm7, mm0 ; mm7=data7 + pfadd mm4, mm2 ; mm4=data5 + pfadd mm1, mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) + + movq mm4, mm0 ; transpose coefficients + punpckldq mm0, mm1 ; mm0=(00 01)=data0 + punpckhdq mm4, mm1 ; mm4=(10 11)=data1 + movq mm5, mm2 ; transpose coefficients + punpckldq mm2, mm3 ; mm2=(60 61)=data6 + punpckhdq mm5, mm3 ; mm5=(70 71)=data7 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm2 ; mm4=data1-data6=tmp6 + pfsub mm0, mm5 ; mm0=data0-data7=tmp7 + pfadd mm6, mm2 ; mm6=data1+data6=tmp1 + pfadd mm7, mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4, mm1 ; transpose coefficients + punpckldq mm1, mm3 ; mm1=(20 21)=data2 + punpckhdq mm4, mm3 ; mm4=(30 31)=data3 + movq mm0, mm2 ; transpose coefficients + punpckldq mm2, mm5 ; mm2=(40 41)=data4 + punpckhdq mm0, mm5 ; mm0=(50 51)=data5 + + movq mm3, mm4 + movq mm5, mm1 + pfadd mm4, mm2 ; mm4=data3+data4=tmp3 + pfadd mm1, mm0 ; mm1=data2+data5=tmp2 + pfsub mm3, mm2 ; mm3=data3-data4=tmp4 + pfsub mm5, mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2, mm7 + movq mm0, mm6 + pfsub mm7, mm4 ; mm7=tmp13 + pfsub mm6, mm1 ; mm6=tmp12 + pfadd mm2, mm4 ; mm2=tmp10 + pfadd mm0, mm1 ; mm0=tmp11 + + pfadd mm6, mm7 + pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4, mm2 + movq mm1, mm7 + pfsub mm2, mm0 ; mm2=data4 + pfsub mm7, mm6 ; mm7=data6 + pfadd mm4, mm0 ; mm4=data0 + pfadd mm1, mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3, mm5 ; mm3=tmp10 + pfadd mm5, mm0 ; mm5=tmp11 + pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2, mm3 ; mm2=tmp10 + pfsub mm3, mm0 + pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2, mm3 ; mm2=z2 + pfadd mm0, mm3 ; mm0=z4 + + movq mm7, mm6 + pfsub mm6, mm5 ; mm6=z13 + pfadd mm7, mm5 ; mm7=z11 + + movq mm4, mm6 + movq mm1, mm7 + pfsub mm6, mm2 ; mm6=data3 + pfsub mm7, mm0 ; mm7=data7 + pfadd mm4, mm2 ; mm4=data5 + pfadd mm1, mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + + femms ; empty MMX/3DNow! state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm new file mode 100644 index 0000000000..86952c6499 --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm @@ -0,0 +1,369 @@ +; +; jfdctflt.asm - floating-point FDCT (SSE) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse(FAST_FLOAT *data) +; + +%define data(b) (b) + 8 ; FAST_FLOAT *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, byte 4*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm new file mode 100644 index 0000000000..80645a50d7 --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm @@ -0,0 +1,395 @@ +; +; jfdctfst.asm - fast integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_mmx) + +EXTN(jconst_fdct_ifast_mmx): + +PW_F0707 times 4 dw F_0_707 << CONST_SHIFT +PW_F0382 times 4 dw F_0_382 << CONST_SHIFT +PW_F0541 times 4 dw F_0_541 << CONST_SHIFT +PW_F1306 times 4 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_mmx(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx) + +EXTN(jsimd_fdct_ifast_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(24 34 25 35) + punpckhwd mm5, mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm4, mm7 ; mm4=(02 12 03 13) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(04 14 05 15) + punpckhwd mm2, mm3 ; mm2=(06 16 07 17) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + psubw mm5, mm7 ; mm5=tmp13 + psubw mm0, mm4 ; mm0=tmp12 + paddw mm1, mm7 ; mm1=tmp10 + paddw mm6, mm4 ; mm6=tmp11 + + paddw mm0, mm5 + psllw mm0, PRE_MULTIPLY_SCALE_BITS + pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7, mm1 + movq mm4, mm5 + psubw mm1, mm6 ; mm1=data4 + psubw mm5, mm0 ; mm5=data6 + paddw mm7, mm6 ; mm7=data0 + paddw mm4, mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2, mm3 ; mm2=tmp10 + paddw mm3, mm6 ; mm3=tmp11 + paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm6, PRE_MULTIPLY_SCALE_BITS + + psllw mm3, PRE_MULTIPLY_SCALE_BITS + pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1, mm2 ; mm1=tmp10 + psubw mm2, mm6 + pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1, mm2 ; mm1=z2 + paddw mm6, mm2 ; mm6=z4 + + movq mm5, mm0 + psubw mm0, mm3 ; mm0=z13 + paddw mm5, mm3 ; mm5=z11 + + movq mm7, mm0 + movq mm4, mm5 + psubw mm0, mm1 ; mm0=data3 + psubw mm5, mm6 ; mm5=data7 + paddw mm7, mm1 ; mm7=data5 + paddw mm4, mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(02 03 12 13) + punpckhwd mm4, mm1 ; mm4=(22 23 32 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(42 43 52 53) + punpckhwd mm5, mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 01 10 11) + punpckhwd mm4, mm7 ; mm4=(20 21 30 31) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(40 41 50 51) + punpckhwd mm2, mm3 ; mm2=(60 61 70 71) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + psubw mm5, mm7 ; mm5=tmp13 + psubw mm0, mm4 ; mm0=tmp12 + paddw mm1, mm7 ; mm1=tmp10 + paddw mm6, mm4 ; mm6=tmp11 + + paddw mm0, mm5 + psllw mm0, PRE_MULTIPLY_SCALE_BITS + pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7, mm1 + movq mm4, mm5 + psubw mm1, mm6 ; mm1=data4 + psubw mm5, mm0 ; mm5=data6 + paddw mm7, mm6 ; mm7=data0 + paddw mm4, mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2, mm3 ; mm2=tmp10 + paddw mm3, mm6 ; mm3=tmp11 + paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm6, PRE_MULTIPLY_SCALE_BITS + + psllw mm3, PRE_MULTIPLY_SCALE_BITS + pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1, mm2 ; mm1=tmp10 + psubw mm2, mm6 + pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1, mm2 ; mm1=z2 + paddw mm6, mm2 ; mm6=z4 + + movq mm5, mm0 + psubw mm0, mm3 ; mm0=z13 + paddw mm5, mm3 ; mm5=z11 + + movq mm7, mm0 + movq mm4, mm5 + psubw mm0, mm1 ; mm0=data3 + psubw mm5, mm6 ; mm5=data7 + paddw mm7, mm1 ; mm7=data5 + paddw mm4, mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm new file mode 100644 index 0000000000..446fa7a68f --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm @@ -0,0 +1,403 @@ +; +; jfdctfst.asm - fast integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, xmm7 ; xmm6=tmp12 + paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm new file mode 100644 index 0000000000..23cf733135 --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm @@ -0,0 +1,331 @@ +; +; jfdctint.asm - accurate integer FDCT (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + vpunpcklwd %5, %1, %2 + vpunpckhwd %6, %1, %2 + vpunpcklwd %7, %3, %4 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 1) + ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53) + ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57) + ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73) + ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77) + + vpunpckldq %1, %5, %7 + vpunpckhdq %2, %5, %7 + vpunpckldq %3, %6, %8 + vpunpckhdq %4, %6, %8 + ; transpose coefficients(phase 2) + ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpermq %1, %1, 0x8D + vpermq %2, %2, 0x8D + vpermq %3, %3, 0xD8 + vpermq %4, %4, 0xD8 + ; transpose coefficients(phase 3) + ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70) + ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 9 + vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 + vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 + vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 + vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5 + + ; -- Even part + + vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1 + vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11 + vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12 + + vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 + vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11 + vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) +%if %9 == 1 + vpsllw %1, %7, PASS1_BITS ; %1=data0_4 +%else + vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)] + vpsraw %1, %7, PASS1_BITS ; %1=data0_4 +%endif + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13 + vpunpcklwd %2, %6, %7 + vpunpckhwd %6, %6, %7 + vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L + vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H + + vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %6, %6, DESCALE_P %+ %9 + + vpackssdw %3, %2, %6 ; %6=data2_6 + + ; -- Odd part + + vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3 + vpunpcklwd %6, %7, %2 + vpunpckhwd %7, %7, %2 + vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L + vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6 + vpunpcklwd %2, %8, %4 + vpunpckhwd %4, %8, %4 + vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L + vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H + + vpaddd %2, %2, %6 ; %2=data7_5L + vpaddd %4, %4, %7 ; %4=data7_5H + + vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %4, %4, DESCALE_P %+ %9 + + vpackssdw %4, %2, %4 ; %4=data7_5 + + vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4 + vpunpcklwd %8, %5, %2 + vpunpckhwd %5, %5, %2 + vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L + vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H + + vpaddd %8, %8, %6 ; %8=data3_1L + vpaddd %5, %5, %7 ; %5=data3_1H + + vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)] + vpsrad %8, %8, DESCALE_P %+ %9 + vpsrad %5, %5, DESCALE_P %+ %9 + + vpackssdw %2, %8, %5 ; %2=data3_1 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_avx2) + +EXTN(jconst_fdct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 + times 4 dw (F_1_501 - F_0_899), -F_0_899 +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_avx2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_avx2) + +EXTN(jsimd_fdct_islow_avx2): + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(ebp)] ; (DCTELEM *) + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vperm2i128 ymm0, ymm4, ymm6, 0x20 + vperm2i128 ymm1, ymm4, ymm6, 0x31 + vperm2i128 ymm2, ymm5, ymm7, 0x20 + vperm2i128 ymm3, ymm5, ymm7, 0x31 + ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 + + ; ---- Pass 2: process columns. + + vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 + vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 + + vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 + vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3 + vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5 + vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3 + vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5 + vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6 + vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7 + + vzeroupper +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm new file mode 100644 index 0000000000..34a43b9e5e --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm @@ -0,0 +1,620 @@ +; +; jfdctint.asm - accurate integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_mmx) + +EXTN(jconst_fdct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_mmx(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_mmx) + +EXTN(jsimd_fdct_islow_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(24 34 25 35) + punpckhwd mm5, mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm4, mm7 ; mm4=(02 12 03 13) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(04 14 05 15) + punpckhwd mm2, mm3 ; mm2=(06 16 07 17) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + paddw mm5, mm7 ; mm5=tmp10 + paddw mm0, mm4 ; mm0=tmp11 + psubw mm1, mm7 ; mm1=tmp13 + psubw mm6, mm4 ; mm6=tmp12 + + movq mm7, mm5 + paddw mm5, mm0 ; mm5=tmp10+tmp11 + psubw mm7, mm0 ; mm7=tmp10-tmp11 + + psllw mm5, PASS1_BITS ; mm5=data0 + psllw mm7, PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4, mm1 ; mm1=tmp13 + movq mm0, mm1 + punpcklwd mm4, mm6 ; mm6=tmp12 + punpckhwd mm0, mm6 + movq mm1, mm4 + movq mm6, mm0 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4, DESCALE_P1 + psrad mm0, DESCALE_P1 + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm4, mm0 ; mm4=data2 + packssdw mm1, mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0, mm2 ; mm2=tmp4 + movq mm6, mm3 ; mm3=tmp5 + paddw mm0, mm5 ; mm0=z3 + paddw mm6, mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4, mm0 + movq mm1, mm0 + punpcklwd mm4, mm6 + punpckhwd mm1, mm6 + movq mm0, mm4 + movq mm6, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4, mm2 + movq mm1, mm2 + punpcklwd mm4, mm7 + punpckhwd mm1, mm7 + movq mm2, mm4 + movq mm7, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2, mm0 ; mm2=data1L + paddd mm7, mm6 ; mm7=data1H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4, DESCALE_P1 + psrad mm1, DESCALE_P1 + paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm2, DESCALE_P1 + psrad mm7, DESCALE_P1 + + packssdw mm4, mm1 ; mm4=data7 + packssdw mm2, mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1, mm3 + movq mm7, mm3 + punpcklwd mm1, mm5 + punpckhwd mm7, mm5 + movq mm3, mm1 + movq mm5, mm7 + pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1, mm0 ; mm1=data5L + paddd mm7, mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1, DESCALE_P1 + psrad mm7, DESCALE_P1 + paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm3, DESCALE_P1 + psrad mm5, DESCALE_P1 + + packssdw mm1, mm7 ; mm1=data5 + packssdw mm3, mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm1 ; mm0=(02 03 12 13) + punpckhwd mm4, mm1 ; mm4=(22 23 32 33) + movq mm5, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm3 ; mm2=(42 43 52 53) + punpckhwd mm5, mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 01 10 11) + punpckhwd mm4, mm7 ; mm4=(20 21 30 31) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm3 ; mm1=(40 41 50 51) + punpckhwd mm2, mm3 ; mm2=(60 61 70 71) + + movq mm7, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1 + movq mm3, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0, mm7 + movq mm5, mm6 + psubw mm7, mm2 ; mm7=data1-data6=tmp6 + psubw mm6, mm3 ; mm6=data0-data7=tmp7 + paddw mm0, mm2 ; mm0=data1+data6=tmp1 + paddw mm5, mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7, mm4 ; transpose coefficients(phase 2) + punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3 + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2, mm7 + movq mm3, mm4 + paddw mm7, mm1 ; mm7=data3+data4=tmp3 + paddw mm4, mm6 ; mm4=data2+data5=tmp2 + psubw mm2, mm1 ; mm2=data3-data4=tmp4 + psubw mm3, mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1, mm5 + movq mm6, mm0 + paddw mm5, mm7 ; mm5=tmp10 + paddw mm0, mm4 ; mm0=tmp11 + psubw mm1, mm7 ; mm1=tmp13 + psubw mm6, mm4 ; mm6=tmp12 + + movq mm7, mm5 + paddw mm5, mm0 ; mm5=tmp10+tmp11 + psubw mm7, mm0 ; mm7=tmp10-tmp11 + + paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)] + paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)] + psraw mm5, PASS1_BITS ; mm5=data0 + psraw mm7, PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4, mm1 ; mm1=tmp13 + movq mm0, mm1 + punpcklwd mm4, mm6 ; mm6=tmp12 + punpckhwd mm0, mm6 + movq mm1, mm4 + movq mm6, mm0 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4, DESCALE_P2 + psrad mm0, DESCALE_P2 + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1, DESCALE_P2 + psrad mm6, DESCALE_P2 + + packssdw mm4, mm0 ; mm4=data2 + packssdw mm1, mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0, mm2 ; mm2=tmp4 + movq mm6, mm3 ; mm3=tmp5 + paddw mm0, mm5 ; mm0=z3 + paddw mm6, mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4, mm0 + movq mm1, mm0 + punpcklwd mm4, mm6 + punpckhwd mm1, mm6 + movq mm0, mm4 + movq mm6, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4, mm2 + movq mm1, mm2 + punpcklwd mm4, mm7 + punpckhwd mm1, mm7 + movq mm2, mm4 + movq mm7, mm1 + pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2, mm0 ; mm2=data1L + paddd mm7, mm6 ; mm7=data1H + + paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4, DESCALE_P2 + psrad mm1, DESCALE_P2 + paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm2, DESCALE_P2 + psrad mm7, DESCALE_P2 + + packssdw mm4, mm1 ; mm4=data7 + packssdw mm2, mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1, mm3 + movq mm7, mm3 + punpcklwd mm1, mm5 + punpckhwd mm7, mm5 + movq mm3, mm1 + movq mm5, mm7 + pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1, mm0 ; mm1=data5L + paddd mm7, mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1, DESCALE_P2 + psrad mm7, DESCALE_P2 + paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm3, DESCALE_P2 + psrad mm5, DESCALE_P2 + + packssdw mm1, mm7 ; mm1=data5 + packssdw mm3, mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm new file mode 100644 index 0000000000..6f8e18cb9d --- /dev/null +++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm @@ -0,0 +1,633 @@ +; +; jfdctint.asm - accurate integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2(DCTELEM *data) +; + +%define data(b) (b) + 8 ; DCTELEM *data + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, xmm7 ; xmm6=tmp11 + psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, xmm0 ; xmm1=data1H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + + movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm new file mode 100644 index 0000000000..87951910d8 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm @@ -0,0 +1,451 @@ +; +; jidctflt.asm - floating-point IDCT (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_3dnow) + +EXTN(jconst_idct_float_3dnow): + +PD_1_414 times 2 dd 1.414213562373095048801689 +PD_1_847 times 2 dd 1.847759065022573512256366 +PD_1_082 times 2 dd 1.082392200292393968799446 +PD_2_613 times 2 dd 2.613125929752753055713286 +PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_3dnow) + +EXTN(jsimd_idct_float_3dnow): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/2 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + pushpic ebx ; save GOT address + mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + or eax, ebx + poppic ebx ; restore GOT address + jnz short .columnDCT + + ; -- AC terms all zero + + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0, mm0 + psrad mm0, (DWORD_BIT-WORD_BIT) + pi2fd mm0, mm0 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm1, mm0 + punpckldq mm0, mm0 + punpckhdq mm1, mm1 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0, mm0 + punpcklwd mm1, mm1 + psrad mm0, (DWORD_BIT-WORD_BIT) + psrad mm1, (DWORD_BIT-WORD_BIT) + pi2fd mm0, mm0 + pi2fd mm1, mm1 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm2, mm2 + punpcklwd mm3, mm3 + psrad mm2, (DWORD_BIT-WORD_BIT) + psrad mm3, (DWORD_BIT-WORD_BIT) + pi2fd mm2, mm2 + pi2fd mm3, mm3 + + pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4, mm0 + movq mm5, mm1 + pfsub mm0, mm2 ; mm0=tmp11 + pfsub mm1, mm3 + pfadd mm4, mm2 ; mm4=tmp10 + pfadd mm5, mm3 ; mm5=tmp13 + + pfmul mm1, [GOTOFF(ebx,PD_1_414)] + pfsub mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm5 ; mm4=tmp3 + pfsub mm0, mm1 ; mm0=tmp2 + pfadd mm6, mm5 ; mm6=tmp0 + pfadd mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm2, mm2 + punpcklwd mm3, mm3 + psrad mm2, (DWORD_BIT-WORD_BIT) + psrad mm3, (DWORD_BIT-WORD_BIT) + pi2fd mm2, mm2 + pi2fd mm3, mm3 + + pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm5, mm5 + punpcklwd mm1, mm1 + psrad mm5, (DWORD_BIT-WORD_BIT) + psrad mm1, (DWORD_BIT-WORD_BIT) + pi2fd mm5, mm5 + pi2fd mm1, mm1 + + pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4, mm2 + movq mm0, mm5 + pfadd mm2, mm1 ; mm2=z11 + pfadd mm5, mm3 ; mm5=z13 + pfsub mm4, mm1 ; mm4=z12 + pfsub mm0, mm3 ; mm0=z10 + + movq mm1, mm2 + pfsub mm2, mm5 + pfadd mm1, mm5 ; mm1=tmp7 + + pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3, mm0 + pfadd mm0, mm4 + pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3, mm0 ; mm3=tmp12 + pfsub mm4, mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3, mm1 ; mm3=tmp6 + movq mm5, mm6 + movq mm0, mm7 + pfadd mm6, mm1 ; mm6=data0=(00 01) + pfadd mm7, mm3 ; mm7=data1=(10 11) + pfsub mm5, mm1 ; mm5=data7=(70 71) + pfsub mm0, mm3 ; mm0=data6=(60 61) + pfsub mm2, mm3 ; mm2=tmp5 + + movq mm1, mm6 ; transpose coefficients + punpckldq mm6, mm7 ; mm6=(00 10) + punpckhdq mm1, mm7 ; mm1=(01 11) + movq mm3, mm0 ; transpose coefficients + punpckldq mm0, mm5 ; mm0=(60 70) + punpckhdq mm3, mm5 ; mm3=(61 71) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm5, MMWORD [wk(1)] ; mm5=tmp3 + + pfadd mm4, mm2 ; mm4=tmp4 + movq mm6, mm7 + movq mm1, mm5 + pfadd mm7, mm2 ; mm7=data2=(20 21) + pfadd mm5, mm4 ; mm5=data4=(40 41) + pfsub mm6, mm2 ; mm6=data5=(50 51) + pfsub mm1, mm4 ; mm1=data3=(30 31) + + movq mm0, mm7 ; transpose coefficients + punpckldq mm7, mm1 ; mm7=(20 30) + punpckhdq mm0, mm1 ; mm0=(21 31) + movq mm3, mm5 ; transpose coefficients + punpckldq mm5, mm6 ; mm5=(40 50) + punpckhdq mm3, mm6 ; mm3=(41 51) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 + +.nextcolumn: + add esi, byte 2*SIZEOF_JCOEF ; coef_block + add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/2 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4, mm0 + movq mm5, mm1 + pfsub mm0, mm2 ; mm0=tmp11 + pfsub mm1, mm3 + pfadd mm4, mm2 ; mm4=tmp10 + pfadd mm5, mm3 ; mm5=tmp13 + + pfmul mm1, [GOTOFF(ebx,PD_1_414)] + pfsub mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + pfsub mm4, mm5 ; mm4=tmp3 + pfsub mm0, mm1 ; mm0=tmp2 + pfadd mm6, mm5 ; mm6=tmp0 + pfadd mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4, mm2 + movq mm0, mm5 + pfadd mm2, mm1 ; mm2=z11 + pfadd mm5, mm3 ; mm5=z13 + pfsub mm4, mm1 ; mm4=z12 + pfsub mm0, mm3 ; mm0=z10 + + movq mm1, mm2 + pfsub mm2, mm5 + pfadd mm1, mm5 ; mm1=tmp7 + + pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3, mm0 + pfadd mm0, mm4 + pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3, mm0 ; mm3=tmp12 + pfsub mm4, mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3, mm1 ; mm3=tmp6 + movq mm5, mm6 + movq mm0, mm7 + pfadd mm6, mm1 ; mm6=data0=(00 10) + pfadd mm7, mm3 ; mm7=data1=(01 11) + pfsub mm5, mm1 ; mm5=data7=(07 17) + pfsub mm0, mm3 ; mm0=data6=(06 16) + pfsub mm2, mm3 ; mm2=tmp5 + + movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] + pcmpeqd mm3, mm3 + psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) + pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) + pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) + pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) + + pand mm6, mm3 ; mm6=(00 -- 10 --) + pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11) + pand mm0, mm3 ; mm0=(06 -- 16 --) + pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17) + por mm6, mm7 ; mm6=(00 01 10 11) + por mm0, mm5 ; mm0=(06 07 16 17) + + movq mm1, MMWORD [wk(0)] ; mm1=tmp2 + movq mm3, MMWORD [wk(1)] ; mm3=tmp3 + + pfadd mm4, mm2 ; mm4=tmp4 + movq mm7, mm1 + movq mm5, mm3 + pfadd mm1, mm2 ; mm1=data2=(02 12) + pfadd mm3, mm4 ; mm3=data4=(04 14) + pfsub mm7, mm2 ; mm7=data5=(05 15) + pfsub mm5, mm4 ; mm5=data3=(03 13) + + movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] + pcmpeqd mm4, mm4 + psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) + pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) + pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) + pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) + + pand mm3, mm4 ; mm3=(04 -- 14 --) + pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15) + pand mm1, mm4 ; mm1=(02 -- 12 --) + pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13) + por mm3, mm7 ; mm3=(04 05 14 15) + por mm1, mm5 ; mm1=(02 03 12 13) + + movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] + + packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15) + packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17) + paddb mm6, mm2 + paddb mm1, mm2 + + movq mm4, mm6 ; transpose coefficients(phase 2) + punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17) + + movq mm7, mm6 ; transpose coefficients(phase 3) + punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + + poppic ebx ; restore GOT address + + add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 2*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm new file mode 100644 index 0000000000..b27ecfdf46 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctflt-sse.asm @@ -0,0 +1,571 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse) + +EXTN(jconst_idct_float_sse): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_0_125 times 4 dd 0.125 ; 1/8 +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse) + +EXTN(jsimd_idct_float_sse): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm1, mm0 ; mm1=(** 02 ** 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **) + cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **) + movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm4, mm0 ; mm4=(** 02 ** 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm5, mm1 ; mm5=(** 22 ** 23) + punpcklwd mm1, mm1 ; mm1=(20 20 21 21) + + psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **) + cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **) + psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) + cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **) + cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **) + + punpckhwd mm6, mm2 ; mm6=(** 42 ** 43) + punpcklwd mm2, mm2 ; mm2=(40 40 41 41) + punpckhwd mm7, mm3 ; mm7=(** 62 ** 63) + punpcklwd mm3, mm3 ; mm3=(60 60 61 61) + + psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) + psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) + cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **) + cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **) + psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) + psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) + cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **) + cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **) + + movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03) + movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23) + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43) + movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63) + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm6, mm4 ; mm6=(** 12 ** 13) + punpcklwd mm4, mm4 ; mm4=(10 10 11 11) + punpckhwd mm2, mm0 ; mm2=(** 32 ** 33) + punpcklwd mm0, mm0 ; mm0=(30 30 31 31) + + psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) + psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) + cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **) + cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **) + psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) + psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) + cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **) + cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **) + + punpckhwd mm7, mm5 ; mm7=(** 52 ** 53) + punpcklwd mm5, mm5 ; mm5=(50 50 51 51) + punpckhwd mm3, mm1 ; mm3=(** 72 ** 73) + punpcklwd mm1, mm1 ; mm1=(70 70 71 71) + + movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13) + movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33) + + psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) + psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) + cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **) + cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **) + psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) + psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) + cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **) + cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53) + movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73) + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] + + mulps xmm6, xmm1 ; descale(1/8) + mulps xmm7, xmm1 ; descale(1/8) + mulps xmm5, xmm1 ; descale(1/8) + mulps xmm0, xmm1 ; descale(1/8) + + movhlps xmm3, xmm6 + movhlps xmm1, xmm7 + cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10) + cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11) + cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30) + cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31) + packssdw mm0, mm2 ; mm0=data0=(00 10 20 30) + packssdw mm1, mm3 ; mm1=data1=(01 11 21 31) + + movhlps xmm6, xmm5 + movhlps xmm7, xmm0 + cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17) + cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16) + cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37) + cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36) + packssdw mm4, mm6 ; mm4=data7=(07 17 27 37) + packssdw mm5, mm7 ; mm5=data6=(06 16 26 36) + + packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36) + packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37) + + movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 + movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm5, xmm3 + movaps xmm0, xmm1 + addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34) + subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35) + subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33) + + mulps xmm3, xmm6 ; descale(1/8) + mulps xmm1, xmm6 ; descale(1/8) + mulps xmm5, xmm6 ; descale(1/8) + mulps xmm0, xmm6 ; descale(1/8) + + movhlps xmm7, xmm3 + movhlps xmm2, xmm1 + cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12) + cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14) + cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32) + cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34) + packssdw mm2, mm6 ; mm2=data2=(02 12 22 32) + packssdw mm3, mm7 ; mm3=data4=(04 14 24 34) + + movhlps xmm4, xmm5 + movhlps xmm6, xmm0 + cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15) + cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13) + cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35) + cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33) + packssdw mm5, mm6 ; mm5=data5=(05 15 25 35) + packssdw mm4, mm7 ; mm4=data3=(03 13 23 33) + + movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34) + packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0, mm6 + paddb mm1, mm6 + paddb mm2, mm6 + paddb mm4, mm6 + + movq mm7, mm0 ; transpose coefficients(phase 1) + punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37) + movq mm3, mm2 ; transpose coefficients(phase 1) + punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33) + punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35) + + movq mm5, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33) + movq mm6, mm3 ; transpose coefficients(phase 2) + punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17) + punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37) + + movq mm1, mm0 ; transpose coefficients(phase 3) + punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17) + movq mm4, mm5 ; transpose coefficients(phase 3) + punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm new file mode 100644 index 0000000000..c646eaef76 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm @@ -0,0 +1,497 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm new file mode 100644 index 0000000000..24622d4369 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm @@ -0,0 +1,499 @@ +; +; jidctfst.asm - fast integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_mmx) + +EXTN(jconst_idct_ifast_mmx): + +PW_F1414 times 4 dw F_1_414 << CONST_SHIFT +PW_F1847 times 4 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 4 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_mmx) + +EXTN(jsimd_idct_ifast_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4, mm0 + movq mm5, mm1 + psubw mm0, mm2 ; mm0=tmp11 + psubw mm1, mm3 + paddw mm4, mm2 ; mm4=tmp10 + paddw mm5, mm3 ; mm5=tmp13 + + psllw mm1, PRE_MULTIPLY_SCALE_BITS + pmulhw mm1, [GOTOFF(ebx,PW_F1414)] + psubw mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + psubw mm4, mm5 ; mm4=tmp3 + psubw mm0, mm1 ; mm0=tmp2 + paddw mm6, mm5 ; mm6=tmp0 + paddw mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4, mm2 + movq mm0, mm5 + psubw mm2, mm1 ; mm2=z12 + psubw mm5, mm3 ; mm5=z10 + paddw mm4, mm1 ; mm4=z11 + paddw mm0, mm3 ; mm0=z13 + + movq mm1, mm5 ; mm1=z10(unscaled) + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm5, PRE_MULTIPLY_SCALE_BITS + + movq mm3, mm4 + psubw mm4, mm0 + paddw mm3, mm0 ; mm3=tmp7 + + psllw mm4, PRE_MULTIPLY_SCALE_BITS + pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0, mm5 + paddw mm5, mm2 + pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw mm2, [GOTOFF(ebx,PW_F1082)] + psubw mm0, mm1 + psubw mm2, mm5 ; mm2=tmp10 + paddw mm0, mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0, mm3 ; mm0=tmp6 + movq mm1, mm6 + movq mm5, mm7 + paddw mm6, mm3 ; mm6=data0=(00 01 02 03) + paddw mm7, mm0 ; mm7=data1=(10 11 12 13) + psubw mm1, mm3 ; mm1=data7=(70 71 72 73) + psubw mm5, mm0 ; mm5=data6=(60 61 62 63) + psubw mm4, mm0 ; mm4=tmp5 + + movq mm3, mm6 ; transpose coefficients(phase 1) + punpcklwd mm6, mm7 ; mm6=(00 10 01 11) + punpckhwd mm3, mm7 ; mm3=(02 12 03 13) + movq mm0, mm5 ; transpose coefficients(phase 1) + punpcklwd mm5, mm1 ; mm5=(60 70 61 71) + punpckhwd mm0, mm1 ; mm0=(62 72 63 73) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm1, MMWORD [wk(1)] ; mm1=tmp3 + + movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) + movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) + + paddw mm2, mm4 ; mm2=tmp4 + movq mm5, mm7 + movq mm0, mm1 + paddw mm7, mm4 ; mm7=data2=(20 21 22 23) + paddw mm1, mm2 ; mm1=data4=(40 41 42 43) + psubw mm5, mm4 ; mm5=data5=(50 51 52 53) + psubw mm0, mm2 ; mm0=data3=(30 31 32 33) + + movq mm4, mm7 ; transpose coefficients(phase 1) + punpcklwd mm7, mm0 ; mm7=(20 30 21 31) + punpckhwd mm4, mm0 ; mm4=(22 32 23 33) + movq mm2, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm5 ; mm1=(40 50 41 51) + punpckhwd mm2, mm5 ; mm2=(42 52 43 53) + + movq mm0, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm7 ; mm6=(00 10 20 30) + punpckhdq mm0, mm7 ; mm0=(01 11 21 31) + movq mm5, mm3 ; transpose coefficients(phase 2) + punpckldq mm3, mm4 ; mm3=(02 12 22 32) + punpckhdq mm5, mm4 ; mm5=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) + movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm6, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm7 ; mm1=(40 50 60 70) + punpckhdq mm6, mm7 ; mm6=(41 51 61 71) + movq mm0, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm4 ; mm2=(42 52 62 72) + punpckhdq mm0, mm4 ; mm0=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm0 + movq mm5, mm1 + psubw mm0, mm2 ; mm0=tmp11 + psubw mm1, mm3 + paddw mm4, mm2 ; mm4=tmp10 + paddw mm5, mm3 ; mm5=tmp13 + + psllw mm1, PRE_MULTIPLY_SCALE_BITS + pmulhw mm1, [GOTOFF(ebx,PW_F1414)] + psubw mm1, mm5 ; mm1=tmp12 + + movq mm6, mm4 + movq mm7, mm0 + psubw mm4, mm5 ; mm4=tmp3 + psubw mm0, mm1 ; mm0=tmp2 + paddw mm6, mm5 ; mm6=tmp0 + paddw mm7, mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm2 + movq mm0, mm5 + psubw mm2, mm1 ; mm2=z12 + psubw mm5, mm3 ; mm5=z10 + paddw mm4, mm1 ; mm4=z11 + paddw mm0, mm3 ; mm0=z13 + + movq mm1, mm5 ; mm1=z10(unscaled) + psllw mm2, PRE_MULTIPLY_SCALE_BITS + psllw mm5, PRE_MULTIPLY_SCALE_BITS + + movq mm3, mm4 + psubw mm4, mm0 + paddw mm3, mm0 ; mm3=tmp7 + + psllw mm4, PRE_MULTIPLY_SCALE_BITS + pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0, mm5 + paddw mm5, mm2 + pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw mm2, [GOTOFF(ebx,PW_F1082)] + psubw mm0, mm1 + psubw mm2, mm5 ; mm2=tmp10 + paddw mm0, mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0, mm3 ; mm0=tmp6 + movq mm1, mm6 + movq mm5, mm7 + paddw mm6, mm3 ; mm6=data0=(00 10 20 30) + paddw mm7, mm0 ; mm7=data1=(01 11 21 31) + psraw mm6, (PASS1_BITS+3) ; descale + psraw mm7, (PASS1_BITS+3) ; descale + psubw mm1, mm3 ; mm1=data7=(07 17 27 37) + psubw mm5, mm0 ; mm5=data6=(06 16 26 36) + psraw mm1, (PASS1_BITS+3) ; descale + psraw mm5, (PASS1_BITS+3) ; descale + psubw mm4, mm0 ; mm4=tmp5 + + packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36) + packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37) + + movq mm3, MMWORD [wk(0)] ; mm3=tmp2 + movq mm0, MMWORD [wk(1)] ; mm0=tmp3 + + paddw mm2, mm4 ; mm2=tmp4 + movq mm5, mm3 + movq mm1, mm0 + paddw mm3, mm4 ; mm3=data2=(02 12 22 32) + paddw mm0, mm2 ; mm0=data4=(04 14 24 34) + psraw mm3, (PASS1_BITS+3) ; descale + psraw mm0, (PASS1_BITS+3) ; descale + psubw mm5, mm4 ; mm5=data5=(05 15 25 35) + psubw mm1, mm2 ; mm1=data3=(03 13 23 33) + psraw mm5, (PASS1_BITS+3) ; descale + psraw mm1, (PASS1_BITS+3) ; descale + + movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] + + packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34) + packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35) + + paddb mm6, mm4 + paddb mm7, mm4 + paddb mm3, mm4 + paddb mm1, mm4 + + movq mm2, mm6 ; transpose coefficients(phase 1) + punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31) + punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37) + movq mm0, mm3 ; transpose coefficients(phase 1) + punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33) + punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35) + + movq mm5, mm6 ; transpose coefficients(phase 2) + punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33) + movq mm4, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17) + punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37) + + movq mm7, mm6 ; transpose coefficients(phase 3) + punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17) + movq mm1, mm5 ; transpose coefficients(phase 3) + punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm new file mode 100644 index 0000000000..19704ffa48 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm @@ -0,0 +1,501 @@ +; +; jidctfst.asm - fast integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [GOTOFF(ebx,PW_F1414)] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2, [GOTOFF(ebx,PW_F1082)] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F1414)] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0, [GOTOFF(ebx,PW_F1082)] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm new file mode 100644 index 0000000000..199c7df3b6 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctint-avx2.asm @@ -0,0 +1,453 @@ +; +; jidctint.asm - accurate integer IDCT (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) + ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76) + + vpermq %5, %1, 0xD8 + vpermq %6, %2, 0x72 + vpermq %7, %3, 0xD8 + vpermq %8, %4, 0x72 + ; transpose coefficients(phase 1) + ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpunpcklwd %1, %5, %6 + vpunpckhwd %2, %5, %6 + vpunpcklwd %3, %7, %8 + vpunpckhwd %4, %7, %8 + ; transpose coefficients(phase 2) + ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72) + ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73) + ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76) + ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77) + + vpunpcklwd %5, %1, %2 + vpunpcklwd %6, %3, %4 + vpunpckhwd %7, %1, %2 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 3) + ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53) + ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57) + ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73) + ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77) + + vpunpcklqdq %1, %5, %6 + vpunpckhqdq %2, %5, %6 + vpunpcklqdq %3, %7, %8 + vpunpckhqdq %4, %7, %8 + ; transpose coefficients(phase 4) + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%12: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 13 + ; -- Even part + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2 + vpunpcklwd %5, %3, %6 ; %5=in26_62L + vpunpckhwd %6, %3, %6 ; %6=in26_62H + vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L + vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H + + vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0 + vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)] + vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4) + + vpxor %1, %1, %1 + vpunpcklwd %8, %1, %7 ; %8=tmp0_1L + vpunpckhwd %1, %1, %7 ; %1=tmp0_1H + vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS + vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS + + vpsubd %3, %8, %5 + vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L + vpaddd %3, %8, %5 + vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L + vpsubd %3, %1, %6 + vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H + vpaddd %3, %1, %6 + vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H + + ; -- Odd part + + vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3 + vpunpcklwd %7, %1, %8 ; %7=z34_43L + vpunpckhwd %8, %1, %8 ; %8=z34_43H + vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L + vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3 + vpunpcklwd %3, %4, %2 ; %3=in71_53L + vpunpckhwd %4, %4, %2 ; %4=in71_53H + + vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L + vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H + vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L + vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H + + vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L + vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H + vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L + vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H + vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L + vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H + + ; -- Final output stage + + vmovdqu %3, %9 + vmovdqu %4, %10 + + vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L + vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H + vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %1, %1, DESCALE_P %+ %13 + vpsrad %2, %2, DESCALE_P %+ %13 + vpackssdw %1, %1, %2 ; %1=data0_1 + + vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L + vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %4, %4, DESCALE_P %+ %13 + vpackssdw %4, %3, %4 ; %4=data7_6 + + vmovdqu %7, %11 + vmovdqu %8, %12 + + vpaddd %2, %7, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L + vpaddd %3, %8, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H + vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %2, %2, DESCALE_P %+ %13 + vpsrad %3, %3, DESCALE_P %+ %13 + vpackssdw %2, %2, %3 ; %2=data3_2 + + vpsubd %3, %7, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L + vpsubd %6, %8, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H + vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %6, %6, DESCALE_P %+ %13 + vpackssdw %3, %3, %6 ; %3=data4_5 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_avx2) + +EXTN(jconst_idct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) + times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 32 db CENTERJSAMPLE +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD + ; ymmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_avx2) + +EXTN(jsimd_idct_islow_avx2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + vpor xmm1, xmm1, xmm0 + vpacksswb xmm1, xmm1, xmm1 + vpacksswb xmm1, xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + vpsllw xmm5, xmm5, PASS1_BITS + + vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03) + vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07) + vinserti128 ymm4, ymm4, xmm5, 1 + + vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04) + vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05) + vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06) + vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) + + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1 + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3 + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5 + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7 + vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4 + vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1 + vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 + vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 + +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm3=in7_5 + vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 + + vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 + vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67 + vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)] + vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)] + + vextracti128 xmm6, ymm1, 1 ; xmm3=data67 + vextracti128 xmm4, ymm0, 1 ; xmm2=data45 + vextracti128 xmm2, ymm1, 0 ; xmm1=data23 + vextracti128 xmm0, ymm0, 0 ; xmm0=data01 + + vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + vzeroupper + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm new file mode 100644 index 0000000000..f15c8d34bc --- /dev/null +++ b/media/libjpeg/simd/i386/jidctint-mmx.asm @@ -0,0 +1,851 @@ +; +; jidctint.asm - accurate integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_mmx) + +EXTN(jconst_idct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 12 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_mmx) + +EXTN(jsimd_idct_islow_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1, mm0 + packsswb mm1, mm1 + movd eax, mm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0, PASS1_BITS + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4, mm1 ; mm1=in2=z2 + movq mm5, mm1 + punpcklwd mm4, mm3 ; mm3=in6=z3 + punpckhwd mm5, mm3 + movq mm1, mm4 + movq mm3, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6, mm0 + paddw mm0, mm2 ; mm0=in0+in4 + psubw mm6, mm2 ; mm6=in0-in4 + + pxor mm7, mm7 + pxor mm2, mm2 + punpcklwd mm7, mm0 ; mm7=tmp0L + punpckhwd mm2, mm0 ; mm2=tmp0H + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0, mm7 + paddd mm7, mm4 ; mm7=tmp10L + psubd mm0, mm4 ; mm0=tmp13L + movq mm4, mm2 + paddd mm2, mm5 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5, mm5 + pxor mm7, mm7 + punpcklwd mm5, mm6 ; mm5=tmp1L + punpckhwd mm7, mm6 ; mm7=tmp1H + psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2, mm5 + paddd mm5, mm1 ; mm5=tmp11L + psubd mm2, mm1 ; mm2=tmp12L + movq mm0, mm7 + paddd mm7, mm3 ; mm7=tmp11H + psubd mm0, mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm5, mm6 + movq mm7, mm4 + paddw mm5, mm3 ; mm5=z3 + paddw mm7, mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2, mm5 + movq mm0, mm5 + punpcklwd mm2, mm7 + punpckhwd mm0, mm7 + movq mm5, mm2 + movq mm7, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2, mm3 + movq mm0, mm3 + punpcklwd mm2, mm4 + punpckhwd mm0, mm4 + movq mm3, mm2 + movq mm4, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3, mm5 ; mm3=tmp3L + paddd mm4, mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2, mm1 + movq mm0, mm1 + punpcklwd mm2, mm6 + punpckhwd mm0, mm6 + movq mm1, mm2 + movq mm6, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2, mm5 ; mm2=tmp1L + paddd mm0, mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2, mm5 + movq mm0, mm7 + paddd mm5, mm3 ; mm5=data0L + paddd mm7, mm4 ; mm7=data0H + psubd mm2, mm3 ; mm2=data7L + psubd mm0, mm4 ; mm0=data7H + + movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] + + paddd mm5, mm3 + paddd mm7, mm3 + psrad mm5, DESCALE_P1 + psrad mm7, DESCALE_P1 + paddd mm2, mm3 + paddd mm0, mm3 + psrad mm2, DESCALE_P1 + psrad mm0, DESCALE_P1 + + packssdw mm5, mm7 ; mm5=data0=(00 01 02 03) + packssdw mm2, mm0 ; mm2=data7=(70 71 72 73) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7, mm4 + movq mm0, mm3 + paddd mm4, mm1 ; mm4=data1L + paddd mm3, mm6 ; mm3=data1H + psubd mm7, mm1 ; mm7=data6L + psubd mm0, mm6 ; mm0=data6H + + movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] + + paddd mm4, mm1 + paddd mm3, mm1 + psrad mm4, DESCALE_P1 + psrad mm3, DESCALE_P1 + paddd mm7, mm1 + paddd mm0, mm1 + psrad mm7, DESCALE_P1 + psrad mm0, DESCALE_P1 + + packssdw mm4, mm3 ; mm4=data1=(10 11 12 13) + packssdw mm7, mm0 ; mm7=data6=(60 61 62 63) + + movq mm6, mm5 ; transpose coefficients(phase 1) + punpcklwd mm5, mm4 ; mm5=(00 10 01 11) + punpckhwd mm6, mm4 ; mm6=(02 12 03 13) + movq mm1, mm7 ; transpose coefficients(phase 1) + punpcklwd mm7, mm2 ; mm7=(60 70 61 71) + punpckhwd mm1, mm2 ; mm1=(62 72 63 73) + + movq mm3, MMWORD [wk(6)] ; mm3=tmp12L + movq mm0, MMWORD [wk(7)] ; mm0=tmp12H + movq mm4, MMWORD [wk(10)] ; mm4=tmp1L + movq mm2, MMWORD [wk(11)] ; mm2=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) + movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) + movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) + movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) + + movq mm5, mm3 + movq mm6, mm0 + paddd mm3, mm4 ; mm3=data2L + paddd mm0, mm2 ; mm0=data2H + psubd mm5, mm4 ; mm5=data5L + psubd mm6, mm2 ; mm6=data5H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] + + paddd mm3, mm7 + paddd mm0, mm7 + psrad mm3, DESCALE_P1 + psrad mm0, DESCALE_P1 + paddd mm5, mm7 + paddd mm6, mm7 + psrad mm5, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm3, mm0 ; mm3=data2=(20 21 22 23) + packssdw mm5, mm6 ; mm5=data5=(50 51 52 53) + + movq mm1, MMWORD [wk(2)] ; mm1=tmp13L + movq mm4, MMWORD [wk(3)] ; mm4=tmp13H + movq mm2, MMWORD [wk(8)] ; mm2=tmp0L + movq mm7, MMWORD [wk(9)] ; mm7=tmp0H + + movq mm0, mm1 + movq mm6, mm4 + paddd mm1, mm2 ; mm1=data3L + paddd mm4, mm7 ; mm4=data3H + psubd mm0, mm2 ; mm0=data4L + psubd mm6, mm7 ; mm6=data4H + + movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] + + paddd mm1, mm2 + paddd mm4, mm2 + psrad mm1, DESCALE_P1 + psrad mm4, DESCALE_P1 + paddd mm0, mm2 + paddd mm6, mm2 + psrad mm0, DESCALE_P1 + psrad mm6, DESCALE_P1 + + packssdw mm1, mm4 ; mm1=data3=(30 31 32 33) + packssdw mm0, mm6 ; mm0=data4=(40 41 42 43) + + movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) + movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) + + movq mm4, mm3 ; transpose coefficients(phase 1) + punpcklwd mm3, mm1 ; mm3=(20 30 21 31) + punpckhwd mm4, mm1 ; mm4=(22 32 23 33) + movq mm6, mm0 ; transpose coefficients(phase 1) + punpcklwd mm0, mm5 ; mm0=(40 50 41 51) + punpckhwd mm6, mm5 ; mm6=(42 52 43 53) + + movq mm1, mm7 ; transpose coefficients(phase 2) + punpckldq mm7, mm3 ; mm7=(00 10 20 30) + punpckhdq mm1, mm3 ; mm1=(01 11 21 31) + movq mm5, mm2 ; transpose coefficients(phase 2) + punpckldq mm2, mm4 ; mm2=(02 12 22 32) + punpckhdq mm5, mm4 ; mm5=(03 13 23 33) + + movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) + movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm7, mm0 ; transpose coefficients(phase 2) + punpckldq mm0, mm3 ; mm0=(40 50 60 70) + punpckhdq mm7, mm3 ; mm7=(41 51 61 71) + movq mm1, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm4 ; mm6=(42 52 62 72) + punpckhdq mm1, mm4 ; mm1=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4, mm1 ; mm1=in2=z2 + movq mm5, mm1 + punpcklwd mm4, mm3 ; mm3=in6=z3 + punpckhwd mm5, mm3 + movq mm1, mm4 + movq mm3, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6, mm0 + paddw mm0, mm2 ; mm0=in0+in4 + psubw mm6, mm2 ; mm6=in0-in4 + + pxor mm7, mm7 + pxor mm2, mm2 + punpcklwd mm7, mm0 ; mm7=tmp0L + punpckhwd mm2, mm0 ; mm2=tmp0H + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0, mm7 + paddd mm7, mm4 ; mm7=tmp10L + psubd mm0, mm4 ; mm0=tmp13L + movq mm4, mm2 + paddd mm2, mm5 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5, mm5 + pxor mm7, mm7 + punpcklwd mm5, mm6 ; mm5=tmp1L + punpckhwd mm7, mm6 ; mm7=tmp1H + psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2, mm5 + paddd mm5, mm1 ; mm5=tmp11L + psubd mm2, mm1 ; mm2=tmp12L + movq mm0, mm7 + paddd mm7, mm3 ; mm7=tmp11H + psubd mm0, mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm5, mm6 + movq mm7, mm4 + paddw mm5, mm3 ; mm5=z3 + paddw mm7, mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2, mm5 + movq mm0, mm5 + punpcklwd mm2, mm7 + punpckhwd mm0, mm7 + movq mm5, mm2 + movq mm7, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2, mm3 + movq mm0, mm3 + punpcklwd mm2, mm4 + punpckhwd mm0, mm4 + movq mm3, mm2 + movq mm4, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3, mm5 ; mm3=tmp3L + paddd mm4, mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2, mm1 + movq mm0, mm1 + punpcklwd mm2, mm6 + punpckhwd mm0, mm6 + movq mm1, mm2 + movq mm6, mm0 + pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2, mm5 ; mm2=tmp1L + paddd mm0, mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2, mm5 + movq mm0, mm7 + paddd mm5, mm3 ; mm5=data0L + paddd mm7, mm4 ; mm7=data0H + psubd mm2, mm3 ; mm2=data7L + psubd mm0, mm4 ; mm0=data7H + + movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] + + paddd mm5, mm3 + paddd mm7, mm3 + psrad mm5, DESCALE_P2 + psrad mm7, DESCALE_P2 + paddd mm2, mm3 + paddd mm0, mm3 + psrad mm2, DESCALE_P2 + psrad mm0, DESCALE_P2 + + packssdw mm5, mm7 ; mm5=data0=(00 10 20 30) + packssdw mm2, mm0 ; mm2=data7=(07 17 27 37) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7, mm4 + movq mm0, mm3 + paddd mm4, mm1 ; mm4=data1L + paddd mm3, mm6 ; mm3=data1H + psubd mm7, mm1 ; mm7=data6L + psubd mm0, mm6 ; mm0=data6H + + movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] + + paddd mm4, mm1 + paddd mm3, mm1 + psrad mm4, DESCALE_P2 + psrad mm3, DESCALE_P2 + paddd mm7, mm1 + paddd mm0, mm1 + psrad mm7, DESCALE_P2 + psrad mm0, DESCALE_P2 + + packssdw mm4, mm3 ; mm4=data1=(01 11 21 31) + packssdw mm7, mm0 ; mm7=data6=(06 16 26 36) + + packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36) + packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37) + + movq mm6, MMWORD [wk(6)] ; mm6=tmp12L + movq mm1, MMWORD [wk(7)] ; mm1=tmp12H + movq mm3, MMWORD [wk(10)] ; mm3=tmp1L + movq mm0, MMWORD [wk(11)] ; mm0=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) + movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) + + movq mm7, mm6 + movq mm2, mm1 + paddd mm6, mm3 ; mm6=data2L + paddd mm1, mm0 ; mm1=data2H + psubd mm7, mm3 ; mm7=data5L + psubd mm2, mm0 ; mm2=data5H + + movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] + + paddd mm6, mm5 + paddd mm1, mm5 + psrad mm6, DESCALE_P2 + psrad mm1, DESCALE_P2 + paddd mm7, mm5 + paddd mm2, mm5 + psrad mm7, DESCALE_P2 + psrad mm2, DESCALE_P2 + + packssdw mm6, mm1 ; mm6=data2=(02 12 22 32) + packssdw mm7, mm2 ; mm7=data5=(05 15 25 35) + + movq mm4, MMWORD [wk(2)] ; mm4=tmp13L + movq mm3, MMWORD [wk(3)] ; mm3=tmp13H + movq mm0, MMWORD [wk(8)] ; mm0=tmp0L + movq mm5, MMWORD [wk(9)] ; mm5=tmp0H + + movq mm1, mm4 + movq mm2, mm3 + paddd mm4, mm0 ; mm4=data3L + paddd mm3, mm5 ; mm3=data3H + psubd mm1, mm0 ; mm1=data4L + psubd mm2, mm5 ; mm2=data4H + + movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] + + paddd mm4, mm0 + paddd mm3, mm0 + psrad mm4, DESCALE_P2 + psrad mm3, DESCALE_P2 + paddd mm1, mm0 + paddd mm2, mm0 + psrad mm1, DESCALE_P2 + psrad mm2, DESCALE_P2 + + movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] + + packssdw mm4, mm3 ; mm4=data3=(03 13 23 33) + packssdw mm1, mm2 ; mm1=data4=(04 14 24 34) + + movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) + movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) + + packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34) + packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0, mm5 + paddb mm3, mm5 + paddb mm6, mm5 + paddb mm4, mm5 + + movq mm2, mm0 ; transpose coefficients(phase 1) + punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37) + movq mm1, mm6 ; transpose coefficients(phase 1) + punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33) + punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35) + + movq mm7, mm0 ; transpose coefficients(phase 2) + punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33) + movq mm5, mm1 ; transpose coefficients(phase 2) + punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17) + punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37) + + movq mm3, mm0 ; transpose coefficients(phase 3) + punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17) + movq mm4, mm7 ; transpose coefficients(phase 3) + punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27) + punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm new file mode 100644 index 0000000000..43e320189b --- /dev/null +++ b/media/libjpeg/simd/i386/jidctint-sse2.asm @@ -0,0 +1,858 @@ +; +; jidctint.asm - accurate integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1 ; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; xmm7=data6H + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 + psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm new file mode 100644 index 0000000000..e2307e1cb6 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctred-mmx.asm @@ -0,0 +1,704 @@ +; +; jidctred.asm - reduced-size IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_mmx) + +EXTN(jconst_idct_red_mmx): + +PW_F184_MF076 times 2 dw F_1_847, -F_0_765 +PW_F256_F089 times 2 dw F_2_562, F_0_899 +PW_F106_MF217 times 2 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 2 dw F_1_451, -F_0_211 +PW_F362_MF127 times 2 dw F_3_624, -F_1_272 +PW_F085_MF072 times 2 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD + ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_mmx) + +EXTN(jsimd_idct_4x4_mmx): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm0, mm1 + packsswb mm0, mm0 + movd eax, mm0 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0, PASS1_BITS + + movq mm2, mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0, mm0 ; mm0=(00 00 01 01) + punpckhwd mm2, mm2 ; mm2=(02 02 03 03) + + movq mm1, mm0 + punpckldq mm0, mm0 ; mm0=(00 00 00 00) + punpckhdq mm1, mm1 ; mm1=(01 01 01 01) + movq mm3, mm2 + punpckldq mm2, mm2 ; mm2=(02 02 02 02) + punpckhdq mm3, mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16, 7 +%endif +.columnDCT: + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm4, mm0 + movq mm5, mm0 + punpcklwd mm4, mm1 + punpckhwd mm5, mm1 + movq mm0, mm4 + movq mm1, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6, mm2 + movq mm7, mm2 + punpcklwd mm6, mm3 + punpckhwd mm7, mm3 + movq mm2, mm6 + movq mm3, mm7 + pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6, mm4 ; mm6=tmp2L + paddd mm7, mm5 ; mm7=tmp2H + paddd mm2, mm0 ; mm2=tmp0L + paddd mm3, mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklwd mm1, mm4 ; mm1=tmp0L + punpckhwd mm2, mm4 ; mm2=tmp0H + psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3, mm5 ; mm5=in2=z2 + punpcklwd mm5, mm0 ; mm0=in6=z3 + punpckhwd mm3, mm0 + pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4, mm1 + movq mm0, mm2 + paddd mm1, mm5 ; mm1=tmp10L + paddd mm2, mm3 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp12L + psubd mm0, mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5, mm1 + movq mm3, mm2 + paddd mm1, mm6 ; mm1=data0L + paddd mm2, mm7 ; mm2=data0H + psubd mm5, mm6 ; mm5=data3L + psubd mm3, mm7 ; mm3=data3H + + movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4] + + paddd mm1, mm6 + paddd mm2, mm6 + psrad mm1, DESCALE_P1_4 + psrad mm2, DESCALE_P1_4 + paddd mm5, mm6 + paddd mm3, mm6 + psrad mm5, DESCALE_P1_4 + psrad mm3, DESCALE_P1_4 + + packssdw mm1, mm2 ; mm1=data0=(00 01 02 03) + packssdw mm5, mm3 ; mm5=data3=(30 31 32 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2, mm4 + movq mm3, mm0 + paddd mm4, mm7 ; mm4=data1L + paddd mm0, mm6 ; mm0=data1H + psubd mm2, mm7 ; mm2=data2L + psubd mm3, mm6 ; mm3=data2H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] + + paddd mm4, mm7 + paddd mm0, mm7 + psrad mm4, DESCALE_P1_4 + psrad mm0, DESCALE_P1_4 + paddd mm2, mm7 + paddd mm3, mm7 + psrad mm2, DESCALE_P1_4 + psrad mm3, DESCALE_P1_4 + + packssdw mm4, mm0 ; mm4=data1=(10 11 12 13) + packssdw mm2, mm3 ; mm2=data2=(20 21 22 23) + + movq mm6, mm1 ; transpose coefficients(phase 1) + punpcklwd mm1, mm4 ; mm1=(00 10 01 11) + punpckhwd mm6, mm4 ; mm6=(02 12 03 13) + movq mm7, mm2 ; transpose coefficients(phase 1) + punpcklwd mm2, mm5 ; mm2=(20 30 21 31) + punpckhwd mm7, mm5 ; mm7=(22 32 23 33) + + movq mm0, mm1 ; transpose coefficients(phase 2) + punpckldq mm1, mm2 ; mm1=(00 10 20 30) + punpckhdq mm0, mm2 ; mm0=(01 11 21 31) + movq mm3, mm6 ; transpose coefficients(phase 2) + punpckldq mm6, mm7 ; mm6=(02 12 22 32) + punpckhdq mm3, mm7 ; mm3=(03 13 23 33) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4, mm0 + movq mm5, mm0 + punpcklwd mm4, mm1 + punpckhwd mm5, mm1 + movq mm0, mm4 + movq mm1, mm5 + pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6, mm2 + movq mm7, mm2 + punpcklwd mm6, mm3 + punpckhwd mm7, mm3 + movq mm2, mm6 + movq mm3, mm7 + pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6, mm4 ; mm6=tmp2L + paddd mm7, mm5 ; mm7=tmp2H + paddd mm2, mm0 ; mm2=tmp0L + paddd mm3, mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklwd mm1, mm4 ; mm1=tmp0L + punpckhwd mm2, mm4 ; mm2=tmp0H + psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3, mm5 ; mm5=in2=z2 + punpcklwd mm5, mm0 ; mm0=in6=z3 + punpckhwd mm3, mm0 + pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4, mm1 + movq mm0, mm2 + paddd mm1, mm5 ; mm1=tmp10L + paddd mm2, mm3 ; mm2=tmp10H + psubd mm4, mm5 ; mm4=tmp12L + psubd mm0, mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5, mm1 + movq mm3, mm2 + paddd mm1, mm6 ; mm1=data0L + paddd mm2, mm7 ; mm2=data0H + psubd mm5, mm6 ; mm5=data3L + psubd mm3, mm7 ; mm3=data3H + + movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] + + paddd mm1, mm6 + paddd mm2, mm6 + psrad mm1, DESCALE_P2_4 + psrad mm2, DESCALE_P2_4 + paddd mm5, mm6 + paddd mm3, mm6 + psrad mm5, DESCALE_P2_4 + psrad mm3, DESCALE_P2_4 + + packssdw mm1, mm2 ; mm1=data0=(00 10 20 30) + packssdw mm5, mm3 ; mm5=data3=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2, mm4 + movq mm3, mm0 + paddd mm4, mm7 ; mm4=data1L + paddd mm0, mm6 ; mm0=data1H + psubd mm2, mm7 ; mm2=data2L + psubd mm3, mm6 ; mm3=data2H + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] + + paddd mm4, mm7 + paddd mm0, mm7 + psrad mm4, DESCALE_P2_4 + psrad mm0, DESCALE_P2_4 + paddd mm2, mm7 + paddd mm3, mm7 + psrad mm2, DESCALE_P2_4 + psrad mm3, DESCALE_P2_4 + + packssdw mm4, mm0 ; mm4=data1=(01 11 21 31) + packssdw mm2, mm3 ; mm2=data2=(02 12 22 32) + + movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32) + packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33) + paddb mm1, mm6 + paddb mm4, mm6 + + movq mm7, mm1 ; transpose coefficients(phase 1) + punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31) + punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33) + + movq mm0, mm1 ; transpose coefficients(phase 2) + punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13) + punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 + + psrlq mm1, 4*BYTE_BIT + psrlq mm0, 4*BYTE_BIT + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_mmx) + +EXTN(jsimd_idct_2x2_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) + ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) + + pcmpeqd mm7, mm7 + pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} + + movq mm4, mm0 ; mm4=(10 11 ** 13) + movq mm5, mm2 ; mm5=(50 51 ** 53) + punpcklwd mm4, mm1 ; mm4=(10 30 11 31) + punpcklwd mm5, mm3 ; mm5=(50 70 51 71) + pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] + + psrld mm0, WORD_BIT ; mm0=(11 -- 13 --) + pand mm1, mm7 ; mm1=(-- 31 -- 33) + psrld mm2, WORD_BIT ; mm2=(51 -- 53 --) + pand mm3, mm7 ; mm3=(-- 71 -- 73) + por mm0, mm1 ; mm0=(11 31 13 33) + por mm2, mm3 ; mm2=(51 71 53 73) + pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm4, mm5 ; mm4=tmp0[col0 col1] + + movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] + pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] + pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) + ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) + + psrld mm6, WORD_BIT ; mm6=(15 -- 17 --) + pand mm1, mm7 ; mm1=(-- 35 -- 37) + psrld mm3, WORD_BIT ; mm3=(55 -- 57 --) + pand mm5, mm7 ; mm5=(-- 75 -- 77) + por mm6, mm1 ; mm6=(15 35 17 37) + por mm3, mm5 ; mm3=(55 75 57 77) + pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm0, mm2 ; mm0=tmp0[col1 col3] + paddd mm6, mm3 ; mm6=tmp0[col5 col7] + + ; -- Even part + + movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) + + movq mm2, mm1 ; mm2=(00 01 ** 03) + pslld mm1, WORD_BIT ; mm1=(-- 00 -- **) + psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] + + pand mm2, mm7 ; mm2=(-- 01 -- 03) + pand mm5, mm7 ; mm5=(-- 05 -- 07) + psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] + psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] + + ; -- Final output stage + + movq mm3, mm1 + paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **) + psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **) + punpckldq mm1, mm3 ; mm1=(A0 B0) + + movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] + + movq mm4, mm2 + movq mm3, mm5 + paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3) + paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7) + psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3) + psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7) + + paddd mm1, mm7 + psrad mm1, DESCALE_P1_2 + + paddd mm2, mm7 + paddd mm5, mm7 + psrad mm2, DESCALE_P1_2 + psrad mm5, DESCALE_P1_2 + paddd mm4, mm7 + paddd mm3, mm7 + psrad mm4, DESCALE_P1_2 + psrad mm3, DESCALE_P1_2 + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3) + packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7) + pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)] + + paddd mm2, mm5 ; mm2=tmp0[row0 row1] + + ; -- Even part + + pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1] + + ; -- Final output stage + + movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] + + movq mm6, mm1 + paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1) + psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1) + + paddd mm1, mm0 + paddd mm6, mm0 + psrad mm1, DESCALE_P2_2 + psrad mm6, DESCALE_P2_2 + + movq mm7, mm1 ; transpose coefficients + punpckldq mm1, mm6 ; mm1=(C0 D0) + punpckhdq mm7, mm6 ; mm7=(C1 D1) + + packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1) + packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) + paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)] + + movd ecx, mm1 + movd ebx, mm1 ; ebx=(C0 D0 C1 D1) + shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm new file mode 100644 index 0000000000..6e56494e97 --- /dev/null +++ b/media/libjpeg/simd/i386/jidctred-sse2.asm @@ -0,0 +1,592 @@ +; +; jidctred.asm - reduced-size IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847, -F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 4 dw F_1_451, -F_0_211 +PW_F362_MF127 times 4 dw F_3_624, -F_1_272 +PW_F085_MF072 times 4 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + +%define original_ebp ebp + 0 +%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16, 7 +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad xmm3, DESCALE_P2_4 + + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)] + + pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b) + 8 ; void *dct_table +%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block +%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf +%define output_col(b) (b) + 20 ; JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)] + + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] + + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 + + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6, DESCALE_P2_2 + + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)] + + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm new file mode 100644 index 0000000000..5cb60caa94 --- /dev/null +++ b/media/libjpeg/simd/i386/jquant-3dn.asm @@ -0,0 +1,230 @@ +; +; jquant.asm - sample data conversion and quantization (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow) + +EXTN(jsimd_convsamp_float_3dnow): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7, mm7 + psllw mm7, 7 + packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0, mm7 ; mm0=(01234567) + psubb mm1, mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2, mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0, mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3, mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1, mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4, mm2 ; mm4=(***0***1) + punpckhwd mm2, mm2 ; mm2=(***2***3) + punpcklwd mm5, mm0 ; mm5=(***4***5) + punpckhwd mm0, mm0 ; mm0=(***6***7) + + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23) + pi2fd mm4, mm4 + pi2fd mm2, mm2 + psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67) + pi2fd mm5, mm5 + pi2fd mm0, mm0 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + + punpcklwd mm6, mm3 ; mm6=(***8***9) + punpckhwd mm3, mm3 ; mm3=(***A***B) + punpcklwd mm4, mm1 ; mm4=(***C***D) + punpckhwd mm1, mm1 ; mm1=(***E***F) + + psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB) + pi2fd mm6, mm6 + pi2fd mm3, mm3 + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF) + pi2fd mm4, mm4 + pi2fd mm1, mm1 + + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_3dnow) + +EXTN(jsimd_quantize_float_3dnow): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) + movd mm7, eax + punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F} + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm0, mm7 ; mm0=(00 ** 01 **) + pfadd mm1, mm7 ; mm1=(02 ** 03 **) + pfadd mm2, mm7 ; mm0=(04 ** 05 **) + pfadd mm3, mm7 ; mm1=(06 ** 07 **) + + movq mm4, mm0 + punpcklwd mm0, mm1 ; mm0=(00 02 ** **) + punpckhwd mm4, mm1 ; mm4=(01 03 ** **) + movq mm5, mm2 + punpcklwd mm2, mm3 ; mm2=(04 06 ** **) + punpckhwd mm5, mm3 ; mm5=(05 07 ** **) + + punpcklwd mm0, mm4 ; mm0=(00 01 02 03) + punpcklwd mm2, mm5 ; mm2=(04 05 06 07) + + movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] + movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm6, mm7 ; mm0=(10 ** 11 **) + pfadd mm1, mm7 ; mm4=(12 ** 13 **) + pfadd mm3, mm7 ; mm0=(14 ** 15 **) + pfadd mm4, mm7 ; mm4=(16 ** 17 **) + + movq mm5, mm6 + punpcklwd mm6, mm1 ; mm6=(10 12 ** **) + punpckhwd mm5, mm1 ; mm5=(11 13 ** **) + movq mm1, mm3 + punpcklwd mm3, mm4 ; mm3=(14 16 ** **) + punpckhwd mm1, mm4 ; mm1=(15 17 ** **) + + punpcklwd mm6, mm5 ; mm6=(10 11 12 13) + punpcklwd mm3, mm1 ; mm3=(14 15 16 17) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm new file mode 100644 index 0000000000..61305c625d --- /dev/null +++ b/media/libjpeg/simd/i386/jquant-mmx.asm @@ -0,0 +1,276 @@ +; +; jquant.asm - sample data conversion and quantization (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_mmx) + +EXTN(jsimd_convsamp_mmx): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor mm6, mm6 ; mm6=(all 0's) + pcmpeqw mm7, mm7 + psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) + movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) + + movq mm4, mm0 + punpcklbw mm0, mm6 ; mm0=(0123) + punpckhbw mm4, mm6 ; mm4=(4567) + movq mm5, mm1 + punpcklbw mm1, mm6 ; mm1=(89AB) + punpckhbw mm5, mm6 ; mm5=(CDEF) + + paddw mm0, mm7 + paddw mm4, mm7 + paddw mm1, mm7 + paddw mm5, mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 + + movq mm0, mm2 + punpcklbw mm2, mm6 ; mm2=(GHIJ) + punpckhbw mm0, mm6 ; mm0=(KLMN) + movq mm4, mm3 + punpcklbw mm3, mm6 ; mm3=(OPQR) + punpckhbw mm4, mm6 ; mm4=(STUV) + + paddw mm2, mm7 + paddw mm0, mm7 + paddw mm3, mm7 + paddw mm4, mm7 + + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) +%define SHIFT(m, n, b) \ + MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_mmx) + +EXTN(jsimd_quantize_mmx): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov ah, 2 + alignx 16, 7 +.quantloop1: + mov al, DCTSIZE2/8/2 + alignx 16, 7 +.quantloop2: + movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] + + movq mm0, mm2 + movq mm1, mm3 + + psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise + psraw mm3, (WORD_BIT-1) + + pxor mm0, mm2 ; val = -val + pxor mm1, mm3 + psubw mm0, mm2 + psubw mm1, mm3 + + ; + ; MMX is an annoyingly crappy instruction set. It has two + ; misfeatures that are causing problems here: + ; + ; - All multiplications are signed. + ; + ; - The second operand for the shifts is not treated as packed. + ; + ; + ; We work around the first problem by implementing this algorithm: + ; + ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) + ; { + ; enum { SHORT_BIT = 16 }; + ; signed short sx = (signed short)x; + ; signed short sy = (signed short)y; + ; signed long sz; + ; + ; sz = (long)sx * (long)sy; /* signed multiply */ + ; + ; if (sx < 0) sz += (long)sy << SHORT_BIT; + ; if (sy < 0) sz += (long)sx << SHORT_BIT; + ; + ; return (unsigned long)sz; + ; } + ; + ; (note that a negative sx adds _sy_ and vice versa) + ; + ; For the second problem, we replace the shift by a multiplication. + ; Unfortunately that means we have to deal with the signed issue again. + ; + + paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw mm1, MMWORD [CORRECTION(0,1,edx)] + + movq mm4, mm0 ; store current value for later + movq mm5, mm1 + pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] + paddw mm0, mm4 ; reciprocal is always negative (MSB=1), + paddw mm1, mm5 ; so we always need to add the initial value + ; (input value is never negative as we + ; inverted it at the start of this routine) + + ; here it gets a bit tricky as both scale + ; and mm0/mm1 can be negative + movq mm6, MMWORD [SCALE(0,0,edx)] ; scale + movq mm7, MMWORD [SCALE(0,1,edx)] + movq mm4, mm0 + movq mm5, mm1 + pmulhw mm0, mm6 + pmulhw mm1, mm7 + + psraw mm6, (WORD_BIT-1) ; determine if scale is negative + psraw mm7, (WORD_BIT-1) + + pand mm6, mm4 ; and add input if it is + pand mm7, mm5 + paddw mm0, mm6 + paddw mm1, mm7 + + psraw mm4, (WORD_BIT-1) ; then check if negative input + psraw mm5, (WORD_BIT-1) + + pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is + pand mm5, MMWORD [SCALE(0,1,edx)] + paddw mm0, mm4 + paddw mm1, mm5 + + pxor mm0, mm2 ; val = -val + pxor mm1, mm3 + psubw mm0, mm2 + psubw mm1, mm3 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 + + add esi, byte 8*SIZEOF_DCTELEM + add edx, byte 8*SIZEOF_DCTELEM + add edi, byte 8*SIZEOF_JCOEF + dec al + jnz near .quantloop2 + dec ah + jnz near .quantloop1 ; to avoid branch misprediction + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm new file mode 100644 index 0000000000..218adc976f --- /dev/null +++ b/media/libjpeg/simd/i386/jquant-sse.asm @@ -0,0 +1,208 @@ +; +; jquant.asm - sample data conversion and quantization (SSE & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse) + +EXTN(jsimd_convsamp_float_sse): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7, mm7 + psllw mm7, 7 + packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0, mm7 ; mm0=(01234567) + psubb mm1, mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2, mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0, mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3, mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1, mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4, mm2 ; mm4=(***0***1) + punpckhwd mm2, mm2 ; mm2=(***2***3) + punpcklwd mm5, mm0 ; mm5=(***4***5) + punpckhwd mm0, mm0 ; mm0=(***6***7) + + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23) + cvtpi2ps xmm0, mm4 ; xmm0=(01**) + cvtpi2ps xmm1, mm2 ; xmm1=(23**) + psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67) + cvtpi2ps xmm2, mm5 ; xmm2=(45**) + cvtpi2ps xmm3, mm0 ; xmm3=(67**) + + punpcklwd mm6, mm3 ; mm6=(***8***9) + punpckhwd mm3, mm3 ; mm3=(***A***B) + punpcklwd mm4, mm1 ; mm4=(***C***D) + punpckhwd mm1, mm1 ; mm1=(***E***F) + + psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB) + cvtpi2ps xmm4, mm6 ; xmm4=(89**) + cvtpi2ps xmm5, mm3 ; xmm5=(AB**) + psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF) + cvtpi2ps xmm6, mm4 ; xmm6=(CD**) + cvtpi2ps xmm7, mm1 ; xmm7=(EF**) + + movlhps xmm0, xmm1 ; xmm0=(0123) + movlhps xmm2, xmm3 ; xmm2=(4567) + movlhps xmm4, xmm5 ; xmm4=(89AB) + movlhps xmm6, xmm7 ; xmm6=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse) + +EXTN(jsimd_quantize_float_sse): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + movhlps xmm4, xmm0 + movhlps xmm5, xmm1 + + cvtps2pi mm0, xmm0 + cvtps2pi mm1, xmm1 + cvtps2pi mm4, xmm4 + cvtps2pi mm5, xmm5 + + movhlps xmm6, xmm2 + movhlps xmm7, xmm3 + + cvtps2pi mm2, xmm2 + cvtps2pi mm3, xmm3 + cvtps2pi mm6, xmm6 + cvtps2pi mm7, xmm7 + + packssdw mm0, mm4 + packssdw mm1, mm5 + packssdw mm2, mm6 + packssdw mm3, mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm new file mode 100644 index 0000000000..a881ab50f9 --- /dev/null +++ b/media/libjpeg/simd/i386/jquantf-sse2.asm @@ -0,0 +1,168 @@ +; +; jquantf.asm - sample data conversion and quantization (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; FAST_FLOAT *divisors +%define workspace ebp + 16 ; FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm new file mode 100644 index 0000000000..5ed6bec246 --- /dev/null +++ b/media/libjpeg/simd/i386/jquanti-avx2.asm @@ -0,0 +1,188 @@ +; +; jquanti.asm - sample data conversion and quantization (AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, 2018, D. R. Commander. +; Copyright (C) 2016, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_avx2) + +EXTN(jsimd_convsamp_avx2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + vinserti128 ymm0, ymm0, xmm1, 1 + vinserti128 ymm2, ymm2, xmm3, 1 + vinserti128 ymm4, ymm4, xmm5, 1 + vinserti128 ymm6, ymm6, xmm7, 1 + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpunpcklbw ymm0, ymm0, ymm1 + vpunpcklbw ymm2, ymm2, ymm1 + vpunpcklbw ymm4, ymm4, ymm1 + vpunpcklbw ymm6, ymm6, ymm1 + + vpcmpeqw ymm7, ymm7, ymm7 + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpaddw ymm4, ymm4, ymm7 + vpaddw ymm6, ymm6, ymm7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 + vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2 + vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4 + vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6 + + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_avx2) + +EXTN(jsimd_quantize_avx2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + + vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)] + vpabsw ymm0, ymm4 + vpabsw ymm1, ymm5 + vpabsw ymm2, ymm6 + vpabsw ymm3, ymm7 + + vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)] + vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)] + vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)] + vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)] + vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)] + vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)] + vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale + vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)] + vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)] + vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)] + + vpsignw ymm0, ymm0, ymm4 + vpsignw ymm1, ymm1, ymm5 + vpsignw ymm2, ymm2, ymm6 + vpsignw ymm3, ymm3, ymm7 + + vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2 + vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm new file mode 100644 index 0000000000..0a509408aa --- /dev/null +++ b/media/libjpeg/simd/i386/jquanti-sse2.asm @@ -0,0 +1,201 @@ +; +; jquanti.asm - sample data conversion and quantization (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp + 8 ; JSAMPARRAY sample_data +%define start_col ebp + 12 ; JDIMENSION start_col +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +%define coef_block ebp + 8 ; JCOEFPTR coef_block +%define divisors ebp + 12 ; DCTELEM *divisors +%define workspace ebp + 16 ; DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16, 7 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c new file mode 100644 index 0000000000..563949a02b --- /dev/null +++ b/media/libjpeg/simd/i386/jsimd.c @@ -0,0 +1,1253 @@ +/* + * jsimd_i386.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "jconfigint.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ +#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ + +static unsigned int simd_support = (unsigned int)(~0); +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif + + if (simd_support != ~0U) + return; + + simd_support = jpeg_simd_cpu_support(); + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEMMX"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_MMX; + env = getenv("JSIMD_FORCE3DNOW"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_3DNOW | JSIMD_MMX; + env = getenv("JSIMD_FORCESSE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE | JSIMD_MMX; + env = getenv("JSIMD_FORCESSE2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE2; + env = getenv("JSIMD_FORCEAVX2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_AVX2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_ycc_convert_avx2; + sse2fct = jsimd_extrgb_ycc_convert_sse2; + mmxfct = jsimd_extrgb_ycc_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_ycc_convert_avx2; + sse2fct = jsimd_extrgbx_ycc_convert_sse2; + mmxfct = jsimd_extrgbx_ycc_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_ycc_convert_avx2; + sse2fct = jsimd_extbgr_ycc_convert_sse2; + mmxfct = jsimd_extbgr_ycc_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_ycc_convert_avx2; + sse2fct = jsimd_extbgrx_ycc_convert_sse2; + mmxfct = jsimd_extbgrx_ycc_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_ycc_convert_avx2; + sse2fct = jsimd_extxbgr_ycc_convert_sse2; + mmxfct = jsimd_extxbgr_ycc_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_ycc_convert_avx2; + sse2fct = jsimd_extxrgb_ycc_convert_sse2; + mmxfct = jsimd_extxrgb_ycc_convert_mmx; + break; + default: + avx2fct = jsimd_rgb_ycc_convert_avx2; + sse2fct = jsimd_rgb_ycc_convert_sse2; + mmxfct = jsimd_rgb_ycc_convert_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_gray_convert_avx2; + sse2fct = jsimd_extrgb_gray_convert_sse2; + mmxfct = jsimd_extrgb_gray_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_gray_convert_avx2; + sse2fct = jsimd_extrgbx_gray_convert_sse2; + mmxfct = jsimd_extrgbx_gray_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_gray_convert_avx2; + sse2fct = jsimd_extbgr_gray_convert_sse2; + mmxfct = jsimd_extbgr_gray_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_gray_convert_avx2; + sse2fct = jsimd_extbgrx_gray_convert_sse2; + mmxfct = jsimd_extbgrx_gray_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_gray_convert_avx2; + sse2fct = jsimd_extxbgr_gray_convert_sse2; + mmxfct = jsimd_extxbgr_gray_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_gray_convert_avx2; + sse2fct = jsimd_extxrgb_gray_convert_sse2; + mmxfct = jsimd_extxrgb_gray_convert_mmx; + break; + default: + avx2fct = jsimd_rgb_gray_convert_avx2; + sse2fct = jsimd_rgb_gray_convert_sse2; + mmxfct = jsimd_rgb_gray_convert_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_ycc_extrgb_convert_avx2; + sse2fct = jsimd_ycc_extrgb_convert_sse2; + mmxfct = jsimd_ycc_extrgb_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_ycc_extrgbx_convert_avx2; + sse2fct = jsimd_ycc_extrgbx_convert_sse2; + mmxfct = jsimd_ycc_extrgbx_convert_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_ycc_extbgr_convert_avx2; + sse2fct = jsimd_ycc_extbgr_convert_sse2; + mmxfct = jsimd_ycc_extbgr_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_ycc_extbgrx_convert_avx2; + sse2fct = jsimd_ycc_extbgrx_convert_sse2; + mmxfct = jsimd_ycc_extbgrx_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_ycc_extxbgr_convert_avx2; + sse2fct = jsimd_ycc_extxbgr_convert_sse2; + mmxfct = jsimd_ycc_extxbgr_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_ycc_extxrgb_convert_avx2; + sse2fct = jsimd_ycc_extxrgb_convert_sse2; + mmxfct = jsimd_ycc_extxrgb_convert_mmx; + break; + default: + avx2fct = jsimd_ycc_rgb_convert_avx2; + sse2fct = jsimd_ycc_rgb_convert_sse2; + mmxfct = jsimd_ycc_rgb_convert_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else + mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_SSE2) + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx; + break; + default: + avx2fct = jsimd_h2v2_merged_upsample_avx2; + sse2fct = jsimd_h2v2_merged_upsample_sse2; + mmxfct = jsimd_h2v2_merged_upsample_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2; + mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx; + break; + default: + avx2fct = jsimd_h2v1_merged_upsample_avx2; + sse2fct = jsimd_h2v1_merged_upsample_sse2; + mmxfct = jsimd_h2v1_merged_upsample_mmx; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_SSE2) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_convsamp_avx2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_SSE2) + jsimd_convsamp_sse2(sample_data, start_col, workspace); + else + jsimd_convsamp_mmx(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_convsamp_float_sse(sample_data, start_col, workspace); + else + jsimd_convsamp_float_3dnow(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_fdct_islow_avx2(data); + else if (simd_support & JSIMD_SSE2) + jsimd_fdct_islow_sse2(data); + else + jsimd_fdct_islow_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + jsimd_fdct_ifast_sse2(data); + else + jsimd_fdct_ifast_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + jsimd_fdct_float_sse(data); + else if (simd_support & JSIMD_3DNOW) + jsimd_fdct_float_3dnow(data); +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_quantize_avx2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_SSE2) + jsimd_quantize_sse2(coef_block, divisors, workspace); + else + jsimd_quantize_mmx(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_quantize_float_sse2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_quantize_float_sse(coef_block, divisors, workspace); + else + jsimd_quantize_float_3dnow(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_AVX2) + jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_SSE2) + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 4) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 4) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return jsimd_encode_mcu_AC_refine_prepare_sse2(block, + jpeg_natural_order_start, + Sl, Al, absvalues, bits); +} diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm new file mode 100644 index 0000000000..ddcafa9e21 --- /dev/null +++ b/media/libjpeg/simd/i386/jsimdcpu.asm @@ -0,0 +1,135 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support(void) +; + + align 32 + GLOBAL_FUNCTION(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused + push edi + + xor edi, edi ; simd support flag + + pushfd + pop eax + mov edx, eax + xor eax, 1<<21 ; flip ID bit in EFLAGS + push eax + popfd + pushfd + pop eax + xor eax, edx + jz near .return ; CPUID is not supported + + ; Check whether CPUID leaf 07H is supported + ; (leaf 07H is used to check for AVX2 instruction support) + xor eax, eax + cpuid + test eax, eax + jz near .return + cmp eax, 7 + jl short .no_avx2 ; Maximum leaf < 07H + + ; Check for AVX2 instruction support + mov eax, 7 + xor ecx, ecx + cpuid + mov eax, ebx + test eax, 1<<5 ; bit5:AVX2 + jz short .no_avx2 + + ; Check for AVX2 O/S support + mov eax, 1 + xor ecx, ecx + cpuid + test ecx, 1<<27 + jz short .no_avx2 ; O/S does not support XSAVE + test ecx, 1<<28 + jz short .no_avx2 ; CPU does not support AVX2 + + xor ecx, ecx + xgetbv + and eax, 6 + cmp eax, 6 ; O/S does not manage XMM/YMM state + ; using XSAVE + jnz short .no_avx2 + + or edi, JSIMD_AVX2 +.no_avx2: + + ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support + xor eax, eax + inc eax + cpuid + mov eax, edx ; eax = Standard feature flags + + ; Check for MMX instruction support + test eax, 1<<23 ; bit23:MMX + jz short .no_mmx + or edi, byte JSIMD_MMX +.no_mmx: + test eax, 1<<25 ; bit25:SSE + jz short .no_sse + or edi, byte JSIMD_SSE +.no_sse: + test eax, 1<<26 ; bit26:SSE2 + jz short .no_sse2 + or edi, byte JSIMD_SSE2 +.no_sse2: + + ; Check for 3DNow! instruction support + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000000 + jbe short .return + + mov eax, 0x80000001 + cpuid + mov eax, edx ; eax = Extended feature flags + + test eax, 1<<31 ; bit31:3DNow!(vendor independent) + jz short .no_3dnow + or edi, byte JSIMD_3DNOW +.no_3dnow: + +.return: + mov eax, edi + + pop edi +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h new file mode 100644 index 0000000000..fdcc61ebf6 --- /dev/null +++ b/media/libjpeg/simd/jsimd.h @@ -0,0 +1,1083 @@ +/* + * simd/jsimd.h + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2014, Linaro Limited. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +/* Bitmask for supported acceleration methods */ + +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 +#define JSIMD_SSE2 0x08 +#define JSIMD_NEON 0x10 +#define JSIMD_DSPR2 0x20 +#define JSIMD_ALTIVEC 0x40 +#define JSIMD_AVX2 0x80 +#define JSIMD_MMI 0x100 + +/* SIMD Ext: retrieve SIMD/CPU information */ +EXTERN(unsigned int) jpeg_simd_cpu_support(void); + +/* RGB & extended RGB --> YCC Colorspace Conversion */ +EXTERN(void) jsimd_rgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_sse2[]; +EXTERN(void) jsimd_rgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_avx2[]; +EXTERN(void) jsimd_rgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmi + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* RGB & extended RGB --> Grayscale Colorspace Conversion */ +EXTERN(void) jsimd_rgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_avx2[]; +EXTERN(void) jsimd_rgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_avx2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* YCC --> RGB & extended RGB Colorspace Conversion */ +EXTERN(void) jsimd_ycc_rgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_sse2[]; +EXTERN(void) jsimd_ycc_rgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_avx2[]; +EXTERN(void) jsimd_ycc_rgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mmi + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +/* NULL Colorspace Conversion */ +EXTERN(void) jsimd_c_null_convert_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows, int num_components); + +/* h2v1 Downsampling */ +EXTERN(void) jsimd_h2v1_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_avx2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Downsampling */ +EXTERN(void) jsimd_h2v2_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_avx2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_mmi + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Smooth Downsampling */ +EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2 + (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor, + int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks, + JDIMENSION image_width); + + +/* Upsampling */ +EXTERN(void) jsimd_h2v1_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_avx2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_avx2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_int_upsample_dspr2 + (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr, JDIMENSION output_width, + int max_v_samp_factor); + +EXTERN(void) jsimd_h2v1_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Fancy Upsampling */ +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_avx2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_avx2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_avx2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v2_fancy_upsample_mmi + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Merged Upsampling */ +EXTERN(void) jsimd_h2v1_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_avx2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); + +EXTERN(void) jsimd_h2v2_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf, JSAMPLE *range); + +EXTERN(void) jsimd_h2v1_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +/* Sample Conversion */ +EXTERN(void) jsimd_convsamp_mmx + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_avx2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_neon + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_altivec + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +/* Floating Point Sample Conversion */ +EXTERN(void) jsimd_convsamp_float_3dnow + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +/* Accurate Integer Forward DCT */ +EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data); + +extern const int jconst_fdct_islow_sse2[]; +EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data); + +extern const int jconst_fdct_islow_avx2[]; +EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data); + +/* Fast Integer Forward DCT */ +EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data); + +extern const int jconst_fdct_ifast_sse2[]; +EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data); + +/* Floating Point Forward DCT */ +EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data); + +extern const int jconst_fdct_float_sse[]; +EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data); + +/* Quantization */ +EXTERN(void) jsimd_quantize_mmx + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_sse2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_avx2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_neon + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_dspr2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_mmi + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_altivec + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +/* Floating Point Quantization */ +EXTERN(void) jsimd_quantize_float_3dnow + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_dspr2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +/* Scaled Inverse DCT */ +EXTERN(void) jsimd_idct_2x2_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_red_sse2[]; +EXTERN(void) jsimd_idct_2x2_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col, int *workspace); +EXTERN(void) jsimd_idct_6x6_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_12x12_pass1_dspr2 + (JCOEFPTR coef_block, void *dct_table, int *workspace); +EXTERN(void) jsimd_idct_12x12_pass2_dspr2 + (int *workspace, int *output); + +/* Accurate Integer Inverse DCT */ +EXTERN(void) jsimd_idct_islow_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_sse2[]; +EXTERN(void) jsimd_idct_islow_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_avx2[]; +EXTERN(void) jsimd_idct_islow_avx2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_dspr2 + (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col); + +EXTERN(void) jsimd_idct_islow_mmi + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Fast Integer Inverse DCT */ +EXTERN(void) jsimd_idct_ifast_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_ifast_sse2[]; +EXTERN(void) jsimd_idct_ifast_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_cols_dspr2 + (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr, + const int *idct_coefs); +EXTERN(void) jsimd_idct_ifast_rows_dspr2 + (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, + const int *idct_coefs); + +EXTERN(void) jsimd_idct_ifast_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Floating Point Inverse DCT */ +EXTERN(void) jsimd_idct_float_3dnow + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse[]; +EXTERN(void) jsimd_idct_float_sse + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse2[]; +EXTERN(void) jsimd_idct_float_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Huffman coding */ +extern const int jconst_huff_encode_one_block[]; +EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2 + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +/* Progressive Huffman encoding */ +EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2 + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *values, size_t *zerobits); + +EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2 + (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, + JCOEF *absvalues, size_t *bits); diff --git a/media/libjpeg/simd/loongson/jccolext-mmi.c b/media/libjpeg/simd/loongson/jccolext-mmi.c new file mode 100644 index 0000000000..6cdeb5e09a --- /dev/null +++ b/media/libjpeg/simd/loongson/jccolext-mmi.c @@ -0,0 +1,483 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * ZhangLixia <zhanglixia-hf@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-mmi.c */ + + +#if RGB_RED == 0 +#define mmA mm0 +#define mmB mm1 +#elif RGB_GREEN == 0 +#define mmA mm2 +#define mmB mm3 +#elif RGB_BLUE == 0 +#define mmA mm4 +#define mmB mm5 +#else +#define mmA mm6 +#define mmB mm7 +#endif + +#if RGB_RED == 1 +#define mmC mm0 +#define mmD mm1 +#elif RGB_GREEN == 1 +#define mmC mm2 +#define mmD mm3 +#elif RGB_BLUE == 1 +#define mmC mm4 +#define mmD mm5 +#else +#define mmC mm6 +#define mmD mm7 +#endif + +#if RGB_RED == 2 +#define mmE mm0 +#define mmF mm1 +#elif RGB_GREEN == 2 +#define mmE mm2 +#define mmF mm3 +#elif RGB_BLUE == 2 +#define mmE mm4 +#define mmF mm5 +#else +#define mmE mm6 +#define mmF mm7 +#endif + +#if RGB_RED == 3 +#define mmG mm0 +#define mmH mm1 +#elif RGB_GREEN == 3 +#define mmG mm2 +#define mmH mm3 +#elif RGB_BLUE == 3 +#define mmG mm4 +#define mmH mm5 +#else +#define mmG mm6 +#define mmH mm7 +#endif + + +void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int num_cols, col; + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + __m64 wk[7]; + __m64 Y_BG, Cb_RG, Cr_BG; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = image_width; num_cols > 0; num_cols -= 8, + outptr0 += 8, outptr1 += 8, outptr2 += 8) { + +#if RGB_PIXELSIZE == 3 + + if (num_cols < 8) { + col = num_cols * 3; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %3\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + "xor $12, $12, $12\r\n" + "move $13, %5\r\n" + "dadd $13, $13, $9\r\n" + "lbu $12, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + "xor $11, $11, $11\r\n" + "move $13, %5\r\n" + "dadd $13, $13, $9\r\n" + "lhu $11, 0($13)\r\n" + "sll $12, $12, 16\r\n" + "or $12, $12, $11\r\n" + + "2: \r\n" + "dmtc1 $12, %0\r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "subu $9, $9, 4\r\n" + "move $13, %5\r\n" + "dadd $13, $13, $9\r\n" + "lwu $14, 0($13)\r\n" + "dmtc1 $14, %1\r\n" + "dsll32 $12, $12, 0\r\n" + "or $12, $12, $14\r\n" + "dmtc1 $12, %0\r\n" + + "3: \r\n" + "li $8, 8\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 4f\r\n" + "nop \r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "li $9, 8\r\n" + "j 5f\r\n" + "nop \r\n" + + "4: \r\n" + "li $8, 16\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 5f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "5: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmG), "=f" (mmF) + : "r" (col), "r" (num_rows), "r" (inptr) + : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13", + "$14", "memory" + ); + } else { + if (!(((long)inptr) & 7)) { + mmA = _mm_load_si64((__m64 *)&inptr[0]); + mmG = _mm_load_si64((__m64 *)&inptr[8]); + mmF = _mm_load_si64((__m64 *)&inptr[16]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmG = _mm_loadu_si64((__m64 *)&inptr[8]); + mmF = _mm_loadu_si64((__m64 *)&inptr[16]); + } + inptr += RGB_PIXELSIZE * 8; + } + mmD = mmA; + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmG); + mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); + + mmD = _mm_unpacklo_pi8(mmD, mmF); + mmG = _mm_unpackhi_pi8(mmG, mmF); + + mmE = mmA; + mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); + mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT); + + mmA = _mm_unpackhi_pi8(mmA, mmD); + mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); + + mmE = _mm_unpacklo_pi8(mmE, mmG); + mmD = _mm_unpackhi_pi8(mmD, mmG); + mmC = mmA; + mmA = _mm_loadlo_pi8_f(mmA); + mmC = _mm_loadhi_pi8_f(mmC); + + mmB = mmE; + mmE = _mm_loadlo_pi8_f(mmE); + mmB = _mm_loadhi_pi8_f(mmB); + + mmF = mmD; + mmD = _mm_loadlo_pi8_f(mmD); + mmF = _mm_loadhi_pi8_f(mmF); + +#else /* RGB_PIXELSIZE == 4 */ + + if (num_cols < 8) { + col = num_cols; + asm(".set noreorder\r\n" + + "li $8, 1\r\n" + "move $9, %4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 1f\r\n" + "nop \r\n" + "subu $9, $9, 1\r\n" + "dsll $11, $9, 2\r\n" + "move $13, %5\r\n" + "daddu $13, $13, $11\r\n" + "lwc1 %0, 0($13)\r\n" + + "1: \r\n" + "li $8, 2\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 2f\r\n" + "nop \r\n" + "subu $9, $9, 2\r\n" + "dsll $11, $9, 2\r\n" + "move $13, %5\r\n" + "daddu $13, $13, $11\r\n" + "mov.s %1, %0\r\n" + "ldc1 %0, 0($13)\r\n" + + "2: \r\n" + "li $8, 4\r\n" + "and $10, $9, $8\r\n" + "beqz $10, 3f\r\n" + "nop \r\n" + "mov.s %2, %0\r\n" + "mov.s %3, %1\r\n" + "ldc1 %0, 0(%5)\r\n" + "ldc1 %1, 8(%5)\r\n" + + "3: \r\n" + "nop \r\n" + ".set reorder\r\n" + + : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC) + : "r" (col), "r" (inptr) + : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory" + ); + } else { + if (!(((long)inptr) & 7)) { + mmA = _mm_load_si64((__m64 *)&inptr[0]); + mmF = _mm_load_si64((__m64 *)&inptr[8]); + mmD = _mm_load_si64((__m64 *)&inptr[16]); + mmC = _mm_load_si64((__m64 *)&inptr[24]); + } else { + mmA = _mm_loadu_si64((__m64 *)&inptr[0]); + mmF = _mm_loadu_si64((__m64 *)&inptr[8]); + mmD = _mm_loadu_si64((__m64 *)&inptr[16]); + mmC = _mm_loadu_si64((__m64 *)&inptr[24]); + } + inptr += RGB_PIXELSIZE * 8; + } + mmB = mmA; + mmA = _mm_unpacklo_pi8(mmA, mmF); + mmB = _mm_unpackhi_pi8(mmB, mmF); + + mmG = mmD; + mmD = _mm_unpacklo_pi8(mmD, mmC); + mmG = _mm_unpackhi_pi8(mmG, mmC); + + mmE = mmA; + mmA = _mm_unpacklo_pi16(mmA, mmD); + mmE = _mm_unpackhi_pi16(mmE, mmD); + + mmH = mmB; + mmB = _mm_unpacklo_pi16(mmB, mmG); + mmH = _mm_unpackhi_pi16(mmH, mmG); + + mmC = mmA; + mmA = _mm_loadlo_pi8_f(mmA); + mmC = _mm_loadhi_pi8_f(mmC); + + mmD = mmB; + mmB = _mm_loadlo_pi8_f(mmB); + mmD = _mm_loadhi_pi8_f(mmD); + + mmG = mmE; + mmE = _mm_loadlo_pi8_f(mmE); + mmG = _mm_loadhi_pi8_f(mmG); + + mmF = mmH; + mmF = _mm_unpacklo_pi8(mmF, mmH); + mmH = _mm_unpackhi_pi8(mmH, mmH); + mmF = _mm_srli_pi16(mmF, BYTE_BIT); + mmH = _mm_srli_pi16(mmH, BYTE_BIT); + +#endif + + wk[0] = mm0; + wk[1] = mm1; + wk[2] = mm4; + wk[3] = mm5; + + mm6 = mm1; + mm1 = _mm_unpacklo_pi16(mm1, mm3); + mm6 = _mm_unpackhi_pi16(mm6, mm3); + mm7 = mm1; + mm4 = mm6; + mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337); + mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); + mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033); + mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); + + wk[4] = mm1; + wk[5] = mm6; + + mm1 = _mm_loadlo_pi16_f(mm5); + mm6 = _mm_loadhi_pi16_f(mm5); + mm1 = _mm_srli_pi32(mm1, 1); + mm6 = _mm_srli_pi32(mm6, 1); + + mm5 = PD_ONEHALFM1_CJ; + mm7 = _mm_add_pi32(mm7, mm1); + mm4 = _mm_add_pi32(mm4, mm6); + mm7 = _mm_add_pi32(mm7, mm5); + mm4 = _mm_add_pi32(mm4, mm5); + mm7 = _mm_srli_pi32(mm7, SCALEBITS); + mm4 = _mm_srli_pi32(mm4, SCALEBITS); + mm7 = _mm_packs_pi32(mm7, mm4); + + mm1 = wk[2]; + mm6 = mm0; + mm0 = _mm_unpacklo_pi16(mm0, mm2); + mm6 = _mm_unpackhi_pi16(mm6, mm2); + mm5 = mm0; + mm4 = mm6; + mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337); + mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); + mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033); + mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); + + wk[6] = mm0; + wk[7] = mm6; + mm0 = _mm_loadlo_pi16_f(mm1); + mm6 = _mm_loadhi_pi16_f(mm1); + mm0 = _mm_srli_pi32(mm0, 1); + mm6 = _mm_srli_pi32(mm6, 1); + + mm1 = PD_ONEHALFM1_CJ; + mm5 = _mm_add_pi32(mm5, mm0); + mm4 = _mm_add_pi32(mm4, mm6); + mm5 = _mm_add_pi32(mm5, mm1); + mm4 = _mm_add_pi32(mm4, mm1); + mm5 = _mm_srli_pi32(mm5, SCALEBITS); + mm4 = _mm_srli_pi32(mm4, SCALEBITS); + mm5 = _mm_packs_pi32(mm5, mm4); + + mm7 = _mm_slli_pi16(mm7, BYTE_BIT); + mm5 = _mm_or_si64(mm5, mm7); + Cb_RG = mm5; + + mm0 = wk[3]; + mm6 = wk[2]; + mm1 = wk[1]; + + mm4 = mm0; + mm0 = _mm_unpacklo_pi16(mm0, mm3); + mm4 = _mm_unpackhi_pi16(mm4, mm3); + mm7 = mm0; + mm5 = mm4; + mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250); + mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); + mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041); + mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); + + mm3 = PD_ONEHALF; + mm0 = _mm_add_pi32(mm0, wk[4]); + mm4 = _mm_add_pi32(mm4, wk[5]); + mm0 = _mm_add_pi32(mm0, mm3); + mm4 = _mm_add_pi32(mm4, mm3); + mm0 = _mm_srli_pi32(mm0, SCALEBITS); + mm4 = _mm_srli_pi32(mm4, SCALEBITS); + mm0 = _mm_packs_pi32(mm0, mm4); + + mm3 = _mm_loadlo_pi16_f(mm1); + mm4 = _mm_loadhi_pi16_f(mm1); + mm3 = _mm_srli_pi32(mm3, 1); + mm4 = _mm_srli_pi32(mm4, 1); + + mm1 = PD_ONEHALFM1_CJ; + mm7 = _mm_add_pi32(mm7, mm3); + mm5 = _mm_add_pi32(mm5, mm4); + mm7 = _mm_add_pi32(mm7, mm1); + mm5 = _mm_add_pi32(mm5, mm1); + mm7 = _mm_srli_pi32(mm7, SCALEBITS); + mm5 = _mm_srli_pi32(mm5, SCALEBITS); + mm7 = _mm_packs_pi32(mm7, mm5); + + mm3 = wk[0]; + mm4 = mm6; + mm6 = _mm_unpacklo_pi16(mm6, mm2); + mm4 = _mm_unpackhi_pi16(mm4, mm2); + mm1 = mm6; + mm5 = mm4; + mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250); + mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); + mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041); + mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); + + mm2 = PD_ONEHALF; + mm6 = _mm_add_pi32(mm6, wk[6]); + mm4 = _mm_add_pi32(mm4, wk[7]); + mm6 = _mm_add_pi32(mm6, mm2); + mm4 = _mm_add_pi32(mm4, mm2); + mm6 = _mm_srli_pi32(mm6, SCALEBITS); + mm4 = _mm_srli_pi32(mm4, SCALEBITS); + mm6 = _mm_packs_pi32(mm6, mm4); + + mm0 = _mm_slli_pi16(mm0, BYTE_BIT); + mm6 = _mm_or_si64(mm6, mm0); + Y_BG = mm6; + + mm2 = _mm_loadlo_pi16_f(mm3); + mm4 = _mm_loadhi_pi16_f(mm3); + mm2 = _mm_srli_pi32(mm2, 1); + mm4 = _mm_srli_pi32(mm4, 1); + + mm0 = PD_ONEHALFM1_CJ; + mm1 = _mm_add_pi32(mm1, mm2); + mm5 = _mm_add_pi32(mm5, mm4); + mm1 = _mm_add_pi32(mm1, mm0); + mm5 = _mm_add_pi32(mm5, mm0); + mm1 = _mm_srli_pi32(mm1, SCALEBITS); + mm5 = _mm_srli_pi32(mm5, SCALEBITS); + mm1 = _mm_packs_pi32(mm1, mm5); + + mm7 = _mm_slli_pi16(mm7, BYTE_BIT); + mm1 = _mm_or_si64(mm1, mm7); + Cr_BG = mm1; + + _mm_store_si64((__m64 *)&outptr0[0], Y_BG); + _mm_store_si64((__m64 *)&outptr1[0], Cb_RG); + _mm_store_si64((__m64 *)&outptr2[0], Cr_BG); + } + } +} + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git a/media/libjpeg/simd/loongson/jccolor-mmi.c b/media/libjpeg/simd/loongson/jccolor-mmi.c new file mode 100644 index 0000000000..93ef5c79f7 --- /dev/null +++ b/media/libjpeg/simd/loongson/jccolor-mmi.c @@ -0,0 +1,148 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_081 ((short)5329) /* FIX(0.08131) */ +#define F_0_114 ((short)7471) /* FIX(0.11400) */ +#define F_0_168 ((short)11059) /* FIX(0.16874) */ +#define F_0_250 ((short)16384) /* FIX(0.25000) */ +#define F_0_299 ((short)19595) /* FIX(0.29900) */ +#define F_0_331 ((short)21709) /* FIX(0.33126) */ +#define F_0_418 ((short)27439) /* FIX(0.41869) */ +#define F_0_587 ((short)38470) /* FIX(0.58700) */ +#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ + +enum const_index { + index_PD_ONEHALF, + index_PW_F0299_F0337, + index_PW_F0114_F0250, + index_PW_MF016_MF033, + index_PW_MF008_MF041, + index_PD_ONEHALFM1_CJ +}; + +static uint64_t const_value[] = { + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))), + _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299), + _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114), + _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168), + _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081), + _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)), + ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS))) +}; + +#define get_const_value(index) (*(__m64 *)&const_value[index]) + +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) +#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337) +#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250) +#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033) +#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041) +#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ) + + +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi +#include "jccolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_mmi diff --git a/media/libjpeg/simd/loongson/jcsample-mmi.c b/media/libjpeg/simd/loongson/jcsample-mmi.c new file mode 100644 index 0000000000..2f2d85196c --- /dev/null +++ b/media/libjpeg/simd/loongson/jcsample-mmi.c @@ -0,0 +1,100 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA DOWNSAMPLING */ + +#include "jsimd_mmi.h" +#include "jcsample.h" + + +void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, outcol, bias; + JDIMENSION output_cols = width_in_blocks * DCTSIZE; + JSAMPROW inptr0, inptr1, outptr; + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + bias = (1 << 17) + 1; /* 0x00020001 (bias pattern) */ + mm7 = _mm_set1_pi32(bias); /* mm7={1, 2, 1, 2} */ + mm6 = _mm_cmpeq_pi16(mm6, mm6); + mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */ + + for (inrow = 0, outrow = 0; outrow < v_samp_factor; + inrow += 2, outrow++) { + + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr = output_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) { + + mm0 = _mm_load_si64((__m64 *)&inptr0[0]); + mm1 = _mm_load_si64((__m64 *)&inptr1[0]); + mm2 = _mm_load_si64((__m64 *)&inptr0[8]); + mm3 = _mm_load_si64((__m64 *)&inptr1[8]); + + mm4 = mm0; + mm5 = mm1; + mm0 = _mm_and_si64(mm0, mm6); + mm4 = _mm_srli_pi16(mm4, BYTE_BIT); + mm1 = _mm_and_si64(mm1, mm6); + mm5 = _mm_srli_pi16(mm5, BYTE_BIT); + mm0 = _mm_add_pi16(mm0, mm4); + mm1 = _mm_add_pi16(mm1, mm5); + + mm4 = mm2; + mm5 = mm3; + mm2 = _mm_and_si64(mm2, mm6); + mm4 = _mm_srli_pi16(mm4, BYTE_BIT); + mm3 = _mm_and_si64(mm3, mm6); + mm5 = _mm_srli_pi16(mm5, BYTE_BIT); + mm2 = _mm_add_pi16(mm2, mm4); + mm3 = _mm_add_pi16(mm3, mm5); + + mm0 = _mm_add_pi16(mm0, mm1); + mm2 = _mm_add_pi16(mm2, mm3); + mm0 = _mm_add_pi16(mm0, mm7); + mm2 = _mm_add_pi16(mm2, mm7); + mm0 = _mm_srli_pi16(mm0, 2); + mm2 = _mm_srli_pi16(mm2, 2); + + mm0 = _mm_packs_pu16(mm0, mm2); + + _mm_store_si64((__m64 *)&outptr[0], mm0); + } + } +} diff --git a/media/libjpeg/simd/loongson/jcsample.h b/media/libjpeg/simd/loongson/jcsample.h new file mode 100644 index 0000000000..2ac48167fc --- /dev/null +++ b/media/libjpeg/simd/loongson/jcsample.h @@ -0,0 +1,28 @@ +/* + * jcsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +LOCAL(void) +expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, + JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int)(output_cols - input_cols); + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} diff --git a/media/libjpeg/simd/loongson/jdcolext-mmi.c b/media/libjpeg/simd/loongson/jdcolext-mmi.c new file mode 100644 index 0000000000..560d9b0227 --- /dev/null +++ b/media/libjpeg/simd/loongson/jdcolext-mmi.c @@ -0,0 +1,424 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-mmi.c */ + + +#if RGB_RED == 0 +#define mmA mm0 +#define mmB mm1 +#elif RGB_GREEN == 0 +#define mmA mm2 +#define mmB mm3 +#elif RGB_BLUE == 0 +#define mmA mm4 +#define mmB mm5 +#else +#define mmA mm6 +#define mmB mm7 +#endif + +#if RGB_RED == 1 +#define mmC mm0 +#define mmD mm1 +#elif RGB_GREEN == 1 +#define mmC mm2 +#define mmD mm3 +#elif RGB_BLUE == 1 +#define mmC mm4 +#define mmD mm5 +#else +#define mmC mm6 +#define mmD mm7 +#endif + +#if RGB_RED == 2 +#define mmE mm0 +#define mmF mm1 +#elif RGB_GREEN == 2 +#define mmE mm2 +#define mmF mm3 +#elif RGB_BLUE == 2 +#define mmE mm4 +#define mmF mm5 +#else +#define mmE mm6 +#define mmF mm7 +#endif + +#if RGB_RED == 3 +#define mmG mm0 +#define mmH mm1 +#elif RGB_GREEN == 3 +#define mmG mm2 +#define mmH mm3 +#elif RGB_BLUE == 3 +#define mmG mm4 +#define mmH mm5 +#else +#define mmG mm6 +#define mmH mm7 +#endif + + +void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int num_cols, col; + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + __m64 mm8, wk[2]; + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + for (num_cols = out_width; num_cols > 0; num_cols -= 8, + inptr0 += 8, inptr1 += 8, inptr2 += 8) { + + mm5 = _mm_load_si64((__m64 *)inptr1); + mm1 = _mm_load_si64((__m64 *)inptr2); + mm8 = _mm_load_si64((__m64 *)inptr0); + mm4 = 0; + mm7 = 0; + mm4 = _mm_cmpeq_pi16(mm4, mm4); + mm7 = _mm_cmpeq_pi16(mm7, mm7); + mm4 = _mm_srli_pi16(mm4, BYTE_BIT); + mm7 = _mm_slli_pi16(mm7, 7); /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */ + mm0 = mm4; /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */ + + mm4 = _mm_and_si64(mm4, mm5); /* mm4=Cb(0246)=CbE */ + mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Cb(1357)=CbO */ + mm0 = _mm_and_si64(mm0, mm1); /* mm0=Cr(0246)=CrE */ + mm1 = _mm_srli_pi16(mm1, BYTE_BIT); /* mm1=Cr(1357)=CrO */ + mm4 = _mm_add_pi16(mm4, mm7); + mm5 = _mm_add_pi16(mm5, mm7); + mm0 = _mm_add_pi16(mm0, mm7); + mm1 = _mm_add_pi16(mm1, mm7); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + + mm2 = mm4; /* mm2 = CbE */ + mm3 = mm5; /* mm3 = CbO */ + mm4 = _mm_add_pi16(mm4, mm4); /* mm4 = 2*CbE */ + mm5 = _mm_add_pi16(mm5, mm5); /* mm5 = 2*CbO */ + mm6 = mm0; /* mm6 = CrE */ + mm7 = mm1; /* mm7 = CrO */ + mm0 = _mm_add_pi16(mm0, mm0); /* mm0 = 2*CrE */ + mm1 = _mm_add_pi16(mm1, mm1); /* mm1 = 2*CrO */ + + mm4 = _mm_mulhi_pi16(mm4, PW_MF0228); /* mm4=(2*CbE * -FIX(0.22800) */ + mm5 = _mm_mulhi_pi16(mm5, PW_MF0228); /* mm5=(2*CbO * -FIX(0.22800) */ + mm0 = _mm_mulhi_pi16(mm0, PW_F0402); /* mm0=(2*CrE * FIX(0.40200)) */ + mm1 = _mm_mulhi_pi16(mm1, PW_F0402); /* mm1=(2*CrO * FIX(0.40200)) */ + + mm4 = _mm_add_pi16(mm4, PW_ONE); + mm5 = _mm_add_pi16(mm5, PW_ONE); + mm4 = _mm_srai_pi16(mm4, 1); /* mm4=(CbE * -FIX(0.22800)) */ + mm5 = _mm_srai_pi16(mm5, 1); /* mm5=(CbO * -FIX(0.22800)) */ + mm0 = _mm_add_pi16(mm0, PW_ONE); + mm1 = _mm_add_pi16(mm1, PW_ONE); + mm0 = _mm_srai_pi16(mm0, 1); /* mm0=(CrE * FIX(0.40200)) */ + mm1 = _mm_srai_pi16(mm1, 1); /* mm1=(CrO * FIX(0.40200)) */ + + mm4 = _mm_add_pi16(mm4, mm2); + mm5 = _mm_add_pi16(mm5, mm3); + mm4 = _mm_add_pi16(mm4, mm2); /* mm4=(CbE * FIX(1.77200))=(B-Y)E */ + mm5 = _mm_add_pi16(mm5, mm3); /* mm5=(CbO * FIX(1.77200))=(B-Y)O */ + mm0 = _mm_add_pi16(mm0, mm6); /* mm0=(CrE * FIX(1.40200))=(R-Y)E */ + mm1 = _mm_add_pi16(mm1, mm7); /* mm1=(CrO * FIX(1.40200))=(R-Y)O */ + + wk[0] = mm4; /* wk(0)=(B-Y)E */ + wk[1] = mm5; /* wk(1)=(B-Y)O */ + + mm4 = mm2; + mm5 = mm3; + mm2 = _mm_unpacklo_pi16(mm2, mm6); + mm4 = _mm_unpackhi_pi16(mm4, mm6); + mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285); + mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285); + mm3 = _mm_unpacklo_pi16(mm3, mm7); + mm5 = _mm_unpackhi_pi16(mm5, mm7); + mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285); + mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285); + + mm2 = _mm_add_pi32(mm2, PD_ONEHALF); + mm4 = _mm_add_pi32(mm4, PD_ONEHALF); + mm2 = _mm_srai_pi32(mm2, SCALEBITS); + mm4 = _mm_srai_pi32(mm4, SCALEBITS); + mm3 = _mm_add_pi32(mm3, PD_ONEHALF); + mm5 = _mm_add_pi32(mm5, PD_ONEHALF); + mm3 = _mm_srai_pi32(mm3, SCALEBITS); + mm5 = _mm_srai_pi32(mm5, SCALEBITS); + + mm2 = _mm_packs_pi32(mm2, mm4); /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */ + mm3 = _mm_packs_pi32(mm3, mm5); /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */ + mm2 = _mm_sub_pi16(mm2, mm6); /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ + mm3 = _mm_sub_pi16(mm3, mm7); /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ + + mm5 = mm8; /* mm5=Y(01234567) */ + + mm4 = _mm_cmpeq_pi16(mm4, mm4); + mm4 = _mm_srli_pi16(mm4, BYTE_BIT); /* mm4={0xFF 0x00 0xFF 0x00 ..} */ + mm4 = _mm_and_si64(mm4, mm5); /* mm4=Y(0246)=YE */ + mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Y(1357)=YO */ + + mm0 = _mm_add_pi16(mm0, mm4); /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */ + mm1 = _mm_add_pi16(mm1, mm5); /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */ + mm0 = _mm_packs_pu16(mm0, mm0); /* mm0=(R0 R2 R4 R6 ** ** ** **) */ + mm1 = _mm_packs_pu16(mm1, mm1); /* mm1=(R1 R3 R5 R7 ** ** ** **) */ + + mm2 = _mm_add_pi16(mm2, mm4); /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */ + mm3 = _mm_add_pi16(mm3, mm5); /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */ + mm2 = _mm_packs_pu16(mm2, mm2); /* mm2=(G0 G2 G4 G6 ** ** ** **) */ + mm3 = _mm_packs_pu16(mm3, mm3); /* mm3=(G1 G3 G5 G7 ** ** ** **) */ + + mm4 = _mm_add_pi16(mm4, wk[0]); /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */ + mm5 = _mm_add_pi16(mm5, wk[1]); /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */ + mm4 = _mm_packs_pu16(mm4, mm4); /* mm4=(B0 B2 B4 B6 ** ** ** **) */ + mm5 = _mm_packs_pu16(mm5, mm5); /* mm5=(B1 B3 B5 B7 ** ** ** **) */ + +#if RGB_PIXELSIZE == 3 + + /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ + /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ + mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */ + mmE = _mm_unpacklo_pi8(mmE, mmB); /* mmE=(20 01 22 03 24 05 26 07) */ + mmD = _mm_unpacklo_pi8(mmD, mmF); /* mmD=(11 21 13 23 15 25 17 27) */ + + mmG = mmA; + mmH = mmA; + mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 01 02 12 22 03) */ + mmG = _mm_unpackhi_pi16(mmG, mmE); /* mmG=(04 14 24 05 06 16 26 07) */ + + mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT); + mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); + + mmC = mmD; + mmB = mmD; + mmD = _mm_unpacklo_pi16(mmD, mmH); /* mmD=(11 21 02 12 13 23 04 14) */ + mmC = _mm_unpackhi_pi16(mmC, mmH); /* mmC=(15 25 06 16 17 27 -- --) */ + + mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */ + + mmF = mmE; + mmE = _mm_unpacklo_pi16(mmE, mmB); /* mmE=(22 03 13 23 24 05 15 25) */ + mmF = _mm_unpackhi_pi16(mmF, mmB); /* mmF=(26 07 17 27 -- -- -- --) */ + + mmA = _mm_unpacklo_pi32(mmA, mmD); /* mmA=(00 10 20 01 11 21 02 12) */ + mmE = _mm_unpacklo_pi32(mmE, mmG); /* mmE=(22 03 13 23 04 14 24 05) */ + mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */ + + if (num_cols >= 8) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmE); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + outptr += RGB_PIXELSIZE * 8; + } else { + col = num_cols * 3; + asm(".set noreorder\r\n" + + "li $8, 16\r\n" + "move $9, %4\r\n" + "mov.s $f4, %1\r\n" + "mov.s $f6, %3\r\n" + "move $10, %5\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, %2\r\n" + "subu $9, $9, 16\r\n" + "daddu $10, $10, 16\r\n" + "b 2f\r\n" + "nop \r\n" + + "1: \r\n" + "li $8, 8\r\n" /* st8 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, ($10)\r\n" + "mov.s $f4, %3\r\n" + "subu $9, $9, 8\r\n" + "daddu $10, $10, 8\r\n" + + "2: \r\n" + "li $8, 4\r\n" /* st4 */ + "mfc1 $11, $f4\r\n" + "bltu $9, $8, 3f\r\n" + "nop \r\n" + "swl $11, 3($10)\r\n" + "swr $11, 0($10)\r\n" + "li $8, 32\r\n" + "mtc1 $8, $f6\r\n" + "dsrl $f4, $f4, $f6\r\n" + "mfc1 $11, $f4\r\n" + "subu $9, $9, 4\r\n" + "daddu $10, $10, 4\r\n" + + "3: \r\n" + "li $8, 2\r\n" /* st2 */ + "bltu $9, $8, 4f\r\n" + "nop \r\n" + "ush $11, 0($10)\r\n" + "srl $11, 16\r\n" + "subu $9, $9, 2\r\n" + "daddu $10, $10, 2\r\n" + + "4: \r\n" + "li $8, 1\r\n" /* st1 */ + "bltu $9, $8, 5f\r\n" + "nop \r\n" + "sb $11, 0($10)\r\n" + + "5: \r\n" + "nop \r\n" /* end */ + : "=m" (*outptr) + : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr) + : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory" + ); + } + +#else /* RGB_PIXELSIZE == 4 */ + +#ifdef RGBX_FILLER_0XFF + mm6 = _mm_cmpeq_pi8(mm6, mm6); + mm7 = _mm_cmpeq_pi8(mm7, mm7); +#else + mm6 = _mm_xor_si64(mm6, mm6); + mm7 = _mm_xor_si64(mm7, mm7); +#endif + /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ + /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ + /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ + /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ + + mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */ + mmE = _mm_unpacklo_pi8(mmE, mmG); /* mmE=(20 30 22 32 24 34 26 36) */ + mmB = _mm_unpacklo_pi8(mmB, mmD); /* mmB=(01 11 03 13 05 15 07 17) */ + mmF = _mm_unpacklo_pi8(mmF, mmH); /* mmF=(21 31 23 33 25 35 27 37) */ + + mmC = mmA; + mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 30 02 12 22 32) */ + mmC = _mm_unpackhi_pi16(mmC, mmE); /* mmC=(04 14 24 34 06 16 26 36) */ + mmG = mmB; + mmB = _mm_unpacklo_pi16(mmB, mmF); /* mmB=(01 11 21 31 03 13 23 33) */ + mmG = _mm_unpackhi_pi16(mmG, mmF); /* mmG=(05 15 25 35 07 17 27 37) */ + + mmD = mmA; + mmA = _mm_unpacklo_pi32(mmA, mmB); /* mmA=(00 10 20 30 01 11 21 31) */ + mmD = _mm_unpackhi_pi32(mmD, mmB); /* mmD=(02 12 22 32 03 13 23 33) */ + mmH = mmC; + mmC = _mm_unpacklo_pi32(mmC, mmG); /* mmC=(04 14 24 34 05 15 25 35) */ + mmH = _mm_unpackhi_pi32(mmH, mmG); /* mmH=(06 16 26 36 07 17 27 37) */ + + if (num_cols >= 8) { + _mm_store_si64((__m64 *)outptr, mmA); + _mm_store_si64((__m64 *)(outptr + 8), mmD); + _mm_store_si64((__m64 *)(outptr + 16), mmC); + _mm_store_si64((__m64 *)(outptr + 24), mmH); + outptr += RGB_PIXELSIZE * 8; + } else { + col = num_cols; + asm(".set noreorder\r\n" /* st16 */ + + "li $8, 4\r\n" + "move $9, %6\r\n" + "move $10, %7\r\n" + "mov.s $f4, %2\r\n" + "mov.s $f6, %4\r\n" + "bltu $9, $8, 1f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, ($10)\r\n" + "gssdlc1 $f6, 7+8($10)\r\n" + "gssdrc1 $f6, 8($10)\r\n" + "mov.s $f4, %3\r\n" + "mov.s $f6, %5\r\n" + "subu $9, $9, 4\r\n" + "daddu $10, $10, 16\r\n" + + "1: \r\n" + "li $8, 2\r\n" /* st8 */ + "bltu $9, $8, 2f\r\n" + "nop \r\n" + "gssdlc1 $f4, 7($10)\r\n" + "gssdrc1 $f4, 0($10)\r\n" + "mov.s $f4, $f6\r\n" + "subu $9, $9, 2\r\n" + "daddu $10, $10, 8\r\n" + + "2: \r\n" + "li $8, 1\r\n" /* st4 */ + "bltu $9, $8, 3f\r\n" + "nop \r\n" + "gsswlc1 $f4, 3($10)\r\n" + "gsswrc1 $f4, 0($10)\r\n" + + "3: \r\n" + "li %1, 0\r\n" /* end */ + : "=m" (*outptr), "=r" (col) + : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col), + "r" (outptr) + : "$f4", "$f6", "$8", "$9", "$10", "memory" + ); + } + +#endif + + } + } +} + +#undef mmA +#undef mmB +#undef mmC +#undef mmD +#undef mmE +#undef mmF +#undef mmG +#undef mmH diff --git a/media/libjpeg/simd/loongson/jdcolor-mmi.c b/media/libjpeg/simd/loongson/jdcolor-mmi.c new file mode 100644 index 0000000000..2c58263dbd --- /dev/null +++ b/media/libjpeg/simd/loongson/jdcolor-mmi.c @@ -0,0 +1,139 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_mmi.h" + + +#define F_0_344 ((short)22554) /* FIX(0.34414) */ +#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */ + +enum const_index { + index_PW_ONE, + index_PW_F0402, + index_PW_MF0228, + index_PW_MF0344_F0285, + index_PD_ONEHALF +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(1, 1, 1, 1), + _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402), + _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228), + _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344), + _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))) +}; + +#define PW_ONE get_const_value(index_PW_ONE) +#define PW_F0402 get_const_value(index_PW_F0402) +#define PW_MF0228 get_const_value(index_PW_MF0228) +#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285) +#define PD_ONEHALF get_const_value(index_PD_ONEHALF) + +#define RGBX_FILLER_0XFF 1 + + +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi +#include "jdcolext-mmi.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_mmi diff --git a/media/libjpeg/simd/loongson/jdsample-mmi.c b/media/libjpeg/simd/loongson/jdsample-mmi.c new file mode 100644 index 0000000000..00a6265176 --- /dev/null +++ b/media/libjpeg/simd/loongson/jdsample-mmi.c @@ -0,0 +1,245 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_mmi.h" + + +enum const_index { + index_PW_THREE, + index_PW_SEVEN, + index_PW_EIGHT, +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(3, 3, 3, 3), + _uint64_set_pi16(7, 7, 7, 7), + _uint64_set_pi16(8, 8, 8, 8), +}; + +#define PW_THREE get_const_value(index_PW_THREE) +#define PW_SEVEN get_const_value(index_PW_SEVEN) +#define PW_EIGHT get_const_value(index_PW_EIGHT) + + +#define PROCESS_ROW(r) { \ + mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \ + mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \ + \ + mm0 = mm7; \ + mm4 = mm3; \ + mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \ + mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \ + mm5 = mm7; \ + mm6 = mm3; \ + mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \ + mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \ + \ + mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \ + mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \ + \ + mm1 = mm7; \ + mm2 = mm3; \ + mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \ + mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \ + mm4 = mm3; \ + mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \ + \ + mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \ + mm2 = _mm_or_si64(mm2, wk[r + 2]); /* mm2=( 5 6 6 8) */ \ + \ + wk[r] = mm4; \ + \ + mm7 = _mm_mullo_pi16(mm7, PW_THREE); \ + mm3 = _mm_mullo_pi16(mm3, PW_THREE); \ + mm1 = _mm_add_pi16(mm1, PW_EIGHT); \ + mm5 = _mm_add_pi16(mm5, PW_EIGHT); \ + mm0 = _mm_add_pi16(mm0, PW_SEVEN); \ + mm2 = _mm_add_pi16(mm2, PW_SEVEN); \ + \ + mm1 = _mm_add_pi16(mm1, mm7); \ + mm5 = _mm_add_pi16(mm5, mm3); \ + mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \ + mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \ + mm0 = _mm_add_pi16(mm0, mm7); \ + mm2 = _mm_add_pi16(mm2, mm3); \ + mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \ + mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \ + \ + mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \ + mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \ + mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \ + mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \ + \ + _mm_store_si64((__m64 *)outptr##r, mm1); \ + _mm_store_si64((__m64 *)outptr##r + 1, mm5); \ +} + +void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol, tmp, tmp1; + __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0; + __m64 wk[4], mm_tmp; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 7) { + tmp = (downsampled_width - 1) * sizeof(JSAMPLE); + tmp1 = downsampled_width * sizeof(JSAMPLE); + asm("daddu $8, %3, %6\r\n" + "lb $9, ($8)\r\n" + "daddu $8, %3, %7\r\n" + "sb $9, ($8)\r\n" + "daddu $8, %4, %6\r\n" + "lb $9, ($8)\r\n" + "daddu $8, %4, %7\r\n" + "sb $9, ($8)\r\n" + "daddu $8, %5, %6\r\n" + "lb $9, ($8)\r\n" + "daddu $8, %5, %7\r\n" + "sb $9, ($8)\r\n" + : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1) + : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1) + : "$8", "$9" + ); + } + + /* process the first column block */ + mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */ + mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */ + mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */ + + mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */ + mm4 = mm0; + mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */ + mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */ + mm5 = mm1; + mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */ + mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */ + mm6 = mm2; + mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */ + mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */ + + mm0 = _mm_mullo_pi16(mm0, PW_THREE); + mm4 = _mm_mullo_pi16(mm4, PW_THREE); + + mm7 = _mm_cmpeq_pi8(mm7, mm7); + mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT); + + mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */ + mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */ + mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */ + mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */ + + _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */ + _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */ + _mm_store_si64((__m64 *)outptr1, mm2); + _mm_store_si64((__m64 *)outptr1 + 1, mm6); + + mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */ + mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */ + + wk[0] = mm1; + wk[1] = mm2; + + for (incol = downsampled_width; incol > 0; + incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8, + outptr0 += 16, outptr1 += 16) { + + if (incol > 8) { + /* process the next column block */ + mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */ + mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */ + mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */ + + mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */ + mm4 = mm0; + mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */ + mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */ + mm5 = mm1; + mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */ + mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */ + mm6 = mm2; + mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */ + mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */ + + mm0 = _mm_mullo_pi16(mm0, PW_THREE); + mm4 = _mm_mullo_pi16(mm4, PW_THREE); + + mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */ + mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */ + mm2 = _mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */ + mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */ + + _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */ + _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */ + _mm_store_si64((__m64 *)outptr1 + 2, mm2); + _mm_store_si64((__m64 *)outptr1 + 3, mm6); + + mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */ + mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */ + + wk[2] = mm1; + wk[3] = mm2; + } else { + /* process the last column block */ + mm1 = _mm_cmpeq_pi8(mm1, mm1); + mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); + mm2 = mm1; + + mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1); + mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */ + mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1); + mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */ + + wk[2] = mm1; + wk[3] = mm2; + } + + /* process the upper row */ + PROCESS_ROW(0) + + /* process the lower row */ + PROCESS_ROW(1) + } + } +} diff --git a/media/libjpeg/simd/loongson/jfdctint-mmi.c b/media/libjpeg/simd/loongson/jfdctint-mmi.c new file mode 100644 index 0000000000..7f4dfe9123 --- /dev/null +++ b/media/libjpeg/simd/loongson/jfdctint-mmi.c @@ -0,0 +1,398 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014, 2018, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER FORWARD DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ +#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ +#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ +#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ +#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ +#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ +#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ +#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ +#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ +#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ +#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ +#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ + +enum const_index { + index_PW_F130_F054, + index_PW_F054_MF130, + index_PW_MF078_F117, + index_PW_F117_F078, + index_PW_MF060_MF089, + index_PW_MF089_F060, + index_PW_MF050_MF256, + index_PW_MF256_F050, + index_PD_DESCALE_P1, + index_PD_DESCALE_P2, + index_PW_DESCALE_P2X +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), + FIX_0_541, (FIX_0_541 + FIX_0_765)), + _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, + (FIX_0_541 - FIX_1_847), FIX_0_541), + _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), + FIX_1_175, (FIX_1_175 - FIX_1_961)), + _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, + (FIX_1_175 - FIX_0_390), FIX_1_175), + _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), + -FIX_0_899, (FIX_0_298 - FIX_0_899)), + _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, + (FIX_1_501 - FIX_0_899), -FIX_0_899), + _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), + -FIX_2_562, (FIX_2_053 - FIX_2_562)), + _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, + (FIX_3_072 - FIX_2_562), -FIX_2_562), + _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), + _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), + _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)), + (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1))) +}; + +#define PW_F130_F054 get_const_value(index_PW_F130_F054) +#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) +#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) +#define PW_F117_F078 get_const_value(index_PW_F117_F078) +#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) +#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) +#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) +#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) +#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) +#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) +#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X) + + +#define DO_FDCT_COMMON(PASS) { \ + __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \ + __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \ + __m64 out1l, out1h, out2l, out2h, out3l, out3h; \ + __m64 out5l, out5h, out6l, out6h, out7l, out7h; \ + __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ + \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * out2 = z1 + tmp13 * 0.765366865; \ + * out6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ + tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \ + tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \ + \ + out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \ + out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \ + out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \ + out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \ + \ + out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ + out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ + out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ + out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ + \ + out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ + out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ + out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ + out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ + \ + out2 = _mm_packs_pi32(out2l, out2h); \ + out6 = _mm_packs_pi32(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = _mm_add_pi16(tmp4, tmp6); \ + z4 = _mm_add_pi16(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = _mm_unpacklo_pi16(z3, z4); \ + z34h = _mm_unpackhi_pi16(z3, z4); \ + z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ + z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ + z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ + z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \ + * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * out7 = tmp4 + z3; out5 = tmp5 + z4; \ + * out3 = tmp6 + z3; out1 = tmp7 + z4; \ + */ \ + \ + tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \ + tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \ + \ + tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \ + tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \ + tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \ + tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \ + \ + out7l = _mm_add_pi32(tmp4l, z3l); \ + out7h = _mm_add_pi32(tmp4h, z3h); \ + out1l = _mm_add_pi32(tmp7l, z4l); \ + out1h = _mm_add_pi32(tmp7h, z4h); \ + \ + out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ + out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ + out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ + out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ + \ + out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ + out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ + out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ + out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ + \ + out7 = _mm_packs_pi32(out7l, out7h); \ + out1 = _mm_packs_pi32(out1l, out1h); \ + \ + tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \ + tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \ + \ + tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \ + tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \ + tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \ + tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \ + \ + out5l = _mm_add_pi32(tmp5l, z4l); \ + out5h = _mm_add_pi32(tmp5h, z4h); \ + out3l = _mm_add_pi32(tmp6l, z3l); \ + out3h = _mm_add_pi32(tmp6h, z3h); \ + \ + out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ + out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ + out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ + out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ + \ + out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ + out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ + out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ + out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ + \ + out5 = _mm_packs_pi32(out5l, out5h); \ + out3 = _mm_packs_pi32(out3l, out3h); \ +} + +#define DO_FDCT_PASS1() { \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ + __m64 tmp10, tmp11; \ + \ + row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ + row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ + row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ + row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ + row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ + row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ + row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ + \ + row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ + row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ + row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ + row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ + \ + col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ + col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ + col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ + col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ + \ + tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ + tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ + tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ + tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ + \ + col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ + col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ + col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ + col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ + \ + tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ + tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ + tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ + tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + out0 = _mm_slli_pi16(out0, PASS1_BITS); \ + out4 = _mm_slli_pi16(out4, PASS1_BITS); \ + \ + DO_FDCT_COMMON(1) \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \ +} + +#define DO_FDCT_PASS2() { \ + __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \ + __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \ + __m64 row0, row1, row2, row3, row4, row5, row6, row7; \ + __m64 tmp10, tmp11; \ + \ + col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ + col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \ + col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \ + col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \ + col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \ + \ + /* Transpose coefficients */ \ + \ + col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ + col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ + col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ + col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ + \ + col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ + col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ + col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ + col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ + \ + row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ + row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ + row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ + row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ + \ + tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ + tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ + tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ + tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ + \ + row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ + row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ + row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ + row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ + \ + tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ + tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ + tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ + tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ + \ + /* Even part */ \ + \ + tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + \ + out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \ + out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \ + out0 = _mm_srai_pi16(out0, PASS1_BITS); \ + out4 = _mm_srai_pi16(out4, PASS1_BITS); \ + \ + DO_FDCT_COMMON(2) \ + \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \ + _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \ +} + +void jsimd_fdct_islow_mmi(DCTELEM *data) +{ + __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + __m64 tmp12, tmp13; + DCTELEM *dataptr = data; + + /* Pass 1: process rows. */ + + DO_FDCT_PASS1() + dataptr += DCTSIZE * 4; + DO_FDCT_PASS1() + + /* Pass 2: process columns. */ + + dataptr = data; + DO_FDCT_PASS2() + dataptr += 4; + DO_FDCT_PASS2() +} diff --git a/media/libjpeg/simd/loongson/jidctint-mmi.c b/media/libjpeg/simd/loongson/jidctint-mmi.c new file mode 100644 index 0000000000..cd3db980c5 --- /dev/null +++ b/media/libjpeg/simd/loongson/jidctint-mmi.c @@ -0,0 +1,571 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCUATE INTEGER INVERSE DCT */ + +#include "jsimd_mmi.h" + + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) +#define CENTERJSAMPLE 128 + +#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ +#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ +#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ +#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ +#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ +#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ +#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ +#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ +#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ +#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ +#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ +#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ + +enum const_index { + index_PW_F130_F054, + index_PW_F054_MF130, + index_PW_MF078_F117, + index_PW_F117_F078, + index_PW_MF060_MF089, + index_PW_MF089_F060, + index_PW_MF050_MF256, + index_PW_MF256_F050, + index_PD_DESCALE_P1, + index_PD_DESCALE_P2, + index_PB_CENTERJSAMP +}; + +static uint64_t const_value[] = { + _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), + FIX_0_541, (FIX_0_541 + FIX_0_765)), + _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, + (FIX_0_541 - FIX_1_847), FIX_0_541), + _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), + FIX_1_175, (FIX_1_175 - FIX_1_961)), + _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, + (FIX_1_175 - FIX_0_390), FIX_1_175), + _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), + -FIX_0_899, (FIX_0_298 - FIX_0_899)), + _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, + (FIX_1_501 - FIX_0_899), -FIX_0_899), + _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), + -FIX_2_562, (FIX_2_053 - FIX_2_562)), + _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, + (FIX_3_072 - FIX_2_562), -FIX_2_562), + _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), + _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), + _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, + CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) +}; + +#define PW_F130_F054 get_const_value(index_PW_F130_F054) +#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) +#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) +#define PW_F117_F078 get_const_value(index_PW_F117_F078) +#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) +#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) +#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) +#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) +#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) +#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) +#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) + + +#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) +#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) + + +#define DO_IDCT_COMMON(PASS) { \ + __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ + __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \ + __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \ + \ + z3 = _mm_add_pi16(tmp0, tmp2); \ + z4 = _mm_add_pi16(tmp1, tmp3); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = _mm_unpacklo_pi16(z3, z4); \ + z34h = _mm_unpackhi_pi16(z3, z4); \ + z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ + z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ + z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ + z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ + \ + /* (Original) \ + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * tmp0 += z1 + z3; tmp1 += z2 + z4; \ + * tmp2 += z2 + z3; tmp3 += z1 + z4; \ + * \ + * (This implementation) \ + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ + * tmp0 += z3; tmp1 += z4; \ + * tmp2 += z3; tmp3 += z4; \ + */ \ + \ + tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \ + tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \ + \ + tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \ + tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \ + tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \ + tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \ + \ + tmp0l = _mm_add_pi32(tmp0l, z3l); \ + tmp0h = _mm_add_pi32(tmp0h, z3h); \ + tmp3l = _mm_add_pi32(tmp3l, z4l); \ + tmp3h = _mm_add_pi32(tmp3h, z4h); \ + \ + tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \ + tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \ + \ + tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \ + tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \ + tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \ + tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \ + \ + tmp1l = _mm_add_pi32(tmp1l, z4l); \ + tmp1h = _mm_add_pi32(tmp1h, z4h); \ + tmp2l = _mm_add_pi32(tmp2l, z3l); \ + tmp2h = _mm_add_pi32(tmp2h, z3h); \ + \ + /* Final output stage */ \ + \ + out0l = _mm_add_pi32(tmp10l, tmp3l); \ + out0h = _mm_add_pi32(tmp10h, tmp3h); \ + out7l = _mm_sub_pi32(tmp10l, tmp3l); \ + out7h = _mm_sub_pi32(tmp10h, tmp3h); \ + \ + out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \ + out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \ + out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \ + out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \ + \ + out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ + out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ + out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ + out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ + \ + out0 = _mm_packs_pi32(out0l, out0h); \ + out7 = _mm_packs_pi32(out7l, out7h); \ + \ + out1l = _mm_add_pi32(tmp11l, tmp2l); \ + out1h = _mm_add_pi32(tmp11h, tmp2h); \ + out6l = _mm_sub_pi32(tmp11l, tmp2l); \ + out6h = _mm_sub_pi32(tmp11h, tmp2h); \ + \ + out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ + out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ + out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ + out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ + \ + out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ + out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ + out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ + out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ + \ + out1 = _mm_packs_pi32(out1l, out1h); \ + out6 = _mm_packs_pi32(out6l, out6h); \ + \ + out2l = _mm_add_pi32(tmp12l, tmp1l); \ + out2h = _mm_add_pi32(tmp12h, tmp1h); \ + out5l = _mm_sub_pi32(tmp12l, tmp1l); \ + out5h = _mm_sub_pi32(tmp12h, tmp1h); \ + \ + out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ + out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ + out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ + out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ + \ + out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ + out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ + out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ + out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ + \ + out2 = _mm_packs_pi32(out2l, out2h); \ + out5 = _mm_packs_pi32(out5l, out5h); \ + \ + out3l = _mm_add_pi32(tmp13l, tmp0l); \ + out3h = _mm_add_pi32(tmp13h, tmp0h); \ + \ + out4l = _mm_sub_pi32(tmp13l, tmp0l); \ + out4h = _mm_sub_pi32(tmp13h, tmp0h); \ + \ + out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ + out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ + out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ + out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ + \ + out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \ + out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \ + out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \ + out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \ + \ + out3 = _mm_packs_pi32(out3l, out3h); \ + out4 = _mm_packs_pi32(out4l, out4h); \ +} + +#define DO_IDCT_PASS1(iter) { \ + __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \ + __m64 quant0l, quant1l, quant2l, quant3l; \ + __m64 quant4l, quant5l, quant6l, quant7l; \ + __m64 z23, z2, z3, z23l, z23h; \ + __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ + __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ + __m32 col0a, col1a, mm0; \ + \ + col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \ + col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \ + mm0 = _mm_or_si32(col0a, col1a); \ + \ + if (test_m32_zero(mm0)) { \ + __m64 mm1, mm2; \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \ + col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \ + \ + mm1 = _mm_or_si64(col1l, col3l); \ + mm2 = _mm_or_si64(col2l, col4l); \ + mm1 = _mm_or_si64(mm1, col5l); \ + mm2 = _mm_or_si64(mm2, col6l); \ + mm1 = _mm_or_si64(mm1, col7l); \ + mm1 = _mm_or_si64(mm1, mm2); \ + \ + if (test_m64_zero(mm1)) { \ + __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \ + \ + /* AC terms all zero */ \ + \ + quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + \ + dcval = _mm_mullo_pi16(col0l, quant0l); \ + dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \ + \ + dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ + dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ + \ + row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ + row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ + row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ + row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \ + \ + goto nextcolumn##iter; \ + } \ + } \ + \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ + col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ + col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \ + col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \ + \ + quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ + quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \ + quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \ + quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \ + \ + z2 = _mm_mullo_pi16(col2l, quant2l); \ + z3 = _mm_mullo_pi16(col6l, quant6l); \ + \ + z23l = _mm_unpacklo_pi16(z2, z3); \ + z23h = _mm_unpackhi_pi16(z2, z3); \ + tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ + tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ + tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ + tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ + \ + z2 = _mm_mullo_pi16(col0l, quant0l); \ + z3 = _mm_mullo_pi16(col4l, quant4l); \ + \ + z23 = _mm_add_pi16(z2, z3); \ + tmp0l = _mm_loadlo_pi16_f(z23); \ + tmp0h = _mm_loadhi_pi16_f(z23); \ + tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ + tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ + \ + tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ + tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ + tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ + tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ + \ + z23 = _mm_sub_pi16(z2, z3); \ + tmp1l = _mm_loadlo_pi16_f(z23); \ + tmp1h = _mm_loadhi_pi16_f(z23); \ + tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ + tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ + \ + tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ + tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ + tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ + tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ + col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ + col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \ + col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \ + \ + quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \ + quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \ + quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \ + quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \ + \ + tmp0 = _mm_mullo_pi16(col7l, quant7l); \ + tmp1 = _mm_mullo_pi16(col5l, quant5l); \ + tmp2 = _mm_mullo_pi16(col3l, quant3l); \ + tmp3 = _mm_mullo_pi16(col1l, quant1l); \ + \ + DO_IDCT_COMMON(1) \ + \ + /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ + /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ + /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ + /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ + \ + /* Transpose coefficients */ \ + \ + row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ + row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ + row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ + row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ + \ + row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ + row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ + row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ + row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ + \ + row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ + row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ + row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ + row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ + \ + row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ + row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ + row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ + row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ + \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \ + _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \ +} + +#define DO_IDCT_PASS2(ctr) { \ + __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \ + __m64 z23, z23l, z23h; \ + __m64 col0123a, col0123b, col0123c, col0123d; \ + __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \ + __m64 col0, col1, col2, col3; \ + __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ + __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ + \ + row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \ + row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \ + row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \ + row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \ + \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + z23l = _mm_unpacklo_pi16(row2l, row6l); \ + z23h = _mm_unpackhi_pi16(row2l, row6l); \ + \ + tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ + tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ + tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ + tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ + \ + z23 = _mm_add_pi16(row0l, row4l); \ + tmp0l = _mm_loadlo_pi16_f(z23); \ + tmp0h = _mm_loadhi_pi16_f(z23); \ + tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ + tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ + \ + tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ + tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ + tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ + tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ + \ + z23 = _mm_sub_pi16(row0l, row4l); \ + tmp1l = _mm_loadlo_pi16_f(z23); \ + tmp1h = _mm_loadhi_pi16_f(z23); \ + tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ + tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ + \ + tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ + tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ + tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ + tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + tmp0 = row7l; \ + tmp1 = row5l; \ + tmp2 = row3l; \ + tmp3 = row1l; \ + \ + DO_IDCT_COMMON(2) \ + \ + /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ + /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ + /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ + /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ + \ + row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ + row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ + row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ + row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ + \ + row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ + row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ + row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \ + row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \ + \ + /* Transpose coefficients */ \ + \ + col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ + col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ + col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ + col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ + \ + col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ + col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ + col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ + col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ + \ + col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ + col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ + col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ + col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ + \ + _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \ + _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \ +} + +void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + __m64 tmp0, tmp1, tmp2, tmp3; + __m64 out0, out1, out2, out3, out4, out5, out6, out7; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + JCOEF *wsptr; + JCOEF workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *)dct_table; + wsptr = workspace; + + DO_IDCT_PASS1(1) +nextcolumn1: + inptr += 4; + quantptr += 4; + wsptr += DCTSIZE * 4; + DO_IDCT_PASS1(2) +nextcolumn2: + + /* Pass 2: process rows. */ + + wsptr = workspace; + + DO_IDCT_PASS2(0) + wsptr += 4; + DO_IDCT_PASS2(4) +} diff --git a/media/libjpeg/simd/loongson/jquanti-mmi.c b/media/libjpeg/simd/loongson/jquanti-mmi.c new file mode 100644 index 0000000000..f9a3f81996 --- /dev/null +++ b/media/libjpeg/simd/loongson/jquanti-mmi.c @@ -0,0 +1,130 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * Copyright (C) 2018, D. R. Commander. All Rights Reserved. + * + * Based on the x86 SIMD extension for IJG JPEG library + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_mmi.h" + + +#define DO_QUANT() { \ + mm2 = _mm_load_si64((__m64 *)&workspace[0]); \ + mm3 = _mm_load_si64((__m64 *)&workspace[4]); \ + \ + mm0 = mm2; \ + mm1 = mm3; \ + \ + mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1)); /* -1 if value < 0, */ \ + /* 0 otherwise */ \ + mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \ + \ + mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ + mm1 = _mm_xor_si64(mm1, mm3); \ + mm0 = _mm_sub_pi16(mm0, mm2); \ + mm1 = _mm_sub_pi16(mm1, mm3); \ + \ + corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \ + corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \ + \ + mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \ + mm1 = _mm_add_pi16(mm1, corr1); \ + \ + mm4 = mm0; \ + mm5 = mm1; \ + \ + recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \ + recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \ + \ + mm0 = _mm_mulhi_pi16(mm0, recip0); \ + mm1 = _mm_mulhi_pi16(mm1, recip1); \ + \ + mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \ + mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \ + /* initial value (input value is never */ \ + /* negative as we inverted it at the */ \ + /* start of this routine) */ \ + \ + scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \ + scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \ + \ + mm6 = scale0; \ + mm7 = scale1; \ + mm4 = mm0; \ + mm5 = mm1; \ + \ + mm0 = _mm_mulhi_pi16(mm0, mm6); \ + mm1 = _mm_mulhi_pi16(mm1, mm7); \ + \ + mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1)); /* determine if scale... */ \ + /* is negative */ \ + mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \ + \ + mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \ + mm7 = _mm_and_si64(mm7, mm5); \ + mm0 = _mm_add_pi16(mm0, mm6); \ + mm1 = _mm_add_pi16(mm1, mm7); \ + \ + mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1)); /* then check if... */ \ + mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1)); /* negative input */ \ + \ + mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \ + mm5 = _mm_and_si64(mm5, scale1); \ + mm0 = _mm_add_pi16(mm0, mm4); \ + mm1 = _mm_add_pi16(mm1, mm5); \ + \ + mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ + mm1 = _mm_xor_si64(mm1, mm3); \ + mm0 = _mm_sub_pi16(mm0, mm2); \ + mm1 = _mm_sub_pi16(mm1, mm3); \ + \ + _mm_store_si64((__m64 *)&output_ptr[0], mm0); \ + _mm_store_si64((__m64 *)&output_ptr[4], mm1); \ + \ + workspace += DCTSIZE; \ + divisors += DCTSIZE; \ + output_ptr += DCTSIZE; \ +} + + +void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + JCOEFPTR output_ptr = coef_block; + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + __m64 corr0, corr1, recip0, recip1, scale0, scale1; + + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() + DO_QUANT() +} diff --git a/media/libjpeg/simd/loongson/jsimd.c b/media/libjpeg/simd/loongson/jsimd.c new file mode 100644 index 0000000000..e8b1832253 --- /dev/null +++ b/media/libjpeg/simd/loongson/jsimd.c @@ -0,0 +1,610 @@ +/* + * jsimd_loongson.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015, 2018, Matthieu Darbois. + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * Loongson architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +static unsigned int simd_support = ~0; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif + + if (simd_support != ~0U) + return; + + simd_support |= JSIMD_MMI; + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_extrgb_ycc_convert_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_extrgbx_ycc_convert_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_extbgr_ycc_convert_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_extbgrx_ycc_convert_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_extxbgr_ycc_convert_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_extxrgb_ycc_convert_mmi; + break; + default: + mmifct = jsimd_rgb_ycc_convert_mmi; + break; + } + + mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + mmifct = jsimd_ycc_extrgb_convert_mmi; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mmifct = jsimd_ycc_extrgbx_convert_mmi; + break; + case JCS_EXT_BGR: + mmifct = jsimd_ycc_extbgr_convert_mmi; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mmifct = jsimd_ycc_extbgrx_convert_mmi; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mmifct = jsimd_ycc_extxbgr_convert_mmi; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mmifct = jsimd_ycc_extxrgb_convert_mmi; + break; + default: + mmifct = jsimd_ycc_rgb_convert_mmi; + break; + } + + mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_mmi(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_mmi(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MMI) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/media/libjpeg/simd/loongson/jsimd_mmi.h b/media/libjpeg/simd/loongson/jsimd_mmi.h new file mode 100644 index 0000000000..59b2ee0b7a --- /dev/null +++ b/media/libjpeg/simd/loongson/jsimd_mmi.h @@ -0,0 +1,59 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Authors: ZhuChen <zhuchen@loongson.cn> + * CaiWanwei <caiwanwei@loongson.cn> + * SunZhangzhi <sunzhangzhi-cq@loongson.cn> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jdct.h" +#include "loongson-mmintrin.h" + + +/* Common code */ + +#define SIZEOF_MMWORD 8 +#define BYTE_BIT 8 +#define WORD_BIT 16 +#define SCALEBITS 16 + +#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \ + (((uint64_t)(uint8_t)a << 56) | \ + ((uint64_t)(uint8_t)b << 48) | \ + ((uint64_t)(uint8_t)c << 40) | \ + ((uint64_t)(uint8_t)d << 32) | \ + ((uint64_t)(uint8_t)e << 24) | \ + ((uint64_t)(uint8_t)f << 16) | \ + ((uint64_t)(uint8_t)g << 8) | \ + ((uint64_t)(uint8_t)h)) +#define _uint64_set_pi16(a, b, c, d) \ + (((uint64_t)(uint16_t)a << 48) | \ + ((uint64_t)(uint16_t)b << 32) | \ + ((uint64_t)(uint16_t)c << 16) | \ + ((uint64_t)(uint16_t)d)) +#define _uint64_set_pi32(a, b) \ + (((uint64_t)(uint32_t)a << 32) | \ + ((uint64_t)(uint32_t)b)) + +#define get_const_value(index) (*(__m64 *)&const_value[index]) diff --git a/media/libjpeg/simd/loongson/loongson-mmintrin.h b/media/libjpeg/simd/loongson/loongson-mmintrin.h new file mode 100644 index 0000000000..50d166b753 --- /dev/null +++ b/media/libjpeg/simd/loongson/loongson-mmintrin.h @@ -0,0 +1,1324 @@ +/* + * Loongson MMI optimizations for libjpeg-turbo + * + * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. + * All Rights Reserved. + * Copyright (C) 2019, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#ifndef __LOONGSON_MMINTRIN_H__ +#define __LOONGSON_MMINTRIN_H__ + +#include <stdint.h> + + +#define FUNCTION_ATTRIBS \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + + +/* Vectors are stored in 64-bit floating-point registers. */ +typedef double __m64; + +/* Having a 32-bit datatype allows us to use 32-bit loads in places like + load8888. */ +typedef float __m32; + + +/********** Set Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setzero_si64(void) +{ + return 0.0; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4, + uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0) +{ + __m64 ret; + uint32_t lo = ((uint32_t)__b6 << 24) | + ((uint32_t)__b4 << 16) | + ((uint32_t)__b2 << 8) | + (uint32_t)__b0; + uint32_t hi = ((uint32_t)__b7 << 24) | + ((uint32_t)__b5 << 16) | + ((uint32_t)__b3 << 8) | + (uint32_t)__b1; + + asm("mtc1 %1, %0\n\t" + "mtc1 %2, $f0\n\t" + "punpcklbh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (lo), "r" (hi) + : "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0) +{ + __m64 ret; + uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0; + uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1; + + asm("mtc1 %1, %0\n\t" + "mtc1 %2, $f0\n\t" + "punpcklhw %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (lo), "r" (hi) + : "$f0" + ); + + return ret; +} + +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set_pi32(uint32_t __i1, uint32_t __i0) +{ + if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) { + uint64_t val = ((uint64_t)__i1 << 32) | + ((uint64_t)__i0 << 0); + + return *(__m64 *)&val; + } else if (__i1 == __i0) { + uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0); + __m64 ret; + + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) + ); + + return ret; + } else { + uint64_t val = ((uint64_t)__i1 << 32) | + ((uint64_t)__i0 << 0); + + return *(__m64 *)&val; + } +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi8(uint8_t __b0) +{ + __m64 ret; + + asm("sll $8, %1, 8\n\t" + "or %1, %1, $8\n\t" + "mtc1 %1, %0\n\t" + "mtc1 $0, $f0\n\t" + "pshufh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (__b0) + : "$8", "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi16(uint16_t __h0) +{ + __m64 ret; + + asm("mtc1 %1, %0\n\t" + "mtc1 $0, $f0\n\t" + "pshufh %0, %0, $f0\n\t" + : "=f" (ret) + : "r" (__h0) + : "$8", "$f0" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_set1_pi32(unsigned __i0) +{ + return _mm_set_pi32(__i0, __i0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3, + uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7) +{ + return _mm_set_pi8(__h7, __h6, __h5, __h4, + __h3, __h2, __h1, __h0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3) +{ + return _mm_set_pi16(__w3, __w2, __w1, __w0); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_setr_pi32(uint32_t __i0, uint32_t __i1) +{ + return _mm_set_pi32(__i1, __i0); +} + + +/********** Arithmetic Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_add_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddsb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddusb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_adds_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("paddush %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_avg_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pavgb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_avg_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pavgh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_madd_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaddhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_max_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaxsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_max_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmaxub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_min_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pminsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_min_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pminub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline int FUNCTION_ATTRIBS +_mm_movemask_pi8(__m64 __m1) +{ + int ret; + + asm("pmovmskb %0, %1\n\t" + : "=r" (ret) + : "y" (__m1) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mulhi_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmulhh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mulhi_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmulhuh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mullo_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmullh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_mul_pu32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pmuluw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sad_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psadbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_asub_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pasubub %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_biadd_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("biadd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_sub_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubsb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubsh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pu8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubusb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_subs_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("psubush %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +/********** Logical Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_and_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("and %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_andnot_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("andn %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_or_si32(__m32 __m1, __m32 __m2) +{ + __m32 ret; + + asm("or %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_or_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("or %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_xor_si64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("xor %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +/********** Shift Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psllh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psllw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_slli_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsll %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrlh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrlw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srli_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsrl %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_pi16(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psrah %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_pi32(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("psraw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_srai_si64(__m64 __m, int64_t __count) +{ + __m64 ret; + + asm("dsra %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + + return ret; +} + + +/********** Conversion Intrinsics **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +to_m64(uint64_t x) +{ + return *(__m64 *)&x; +} + +extern __inline uint64_t FUNCTION_ATTRIBS +to_uint64(__m64 x) +{ + return *(uint64_t *)&x; +} + + +/********** Comparison Intrinsics **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpeqw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgtb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgth %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpgtw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpltb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmplth %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_cmplt_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("pcmpltw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +/********** Miscellaneous Operations **********/ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsshb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsswh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pi32_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packsswh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_packs_pu16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("packushb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_extract_pi16(__m64 __m, int64_t __pos) +{ + __m64 ret; + + asm("pextrh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__pos) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos) +{ + __m64 ret; + + switch (__pos) { + case 0: + + asm("pinsrh_0 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + + case 1: + + asm("pinsrh_1 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + case 2: + + asm("pinsrh_2 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + + case 3: + + asm("pinsrh_3 %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2), "i" (__pos) + ); + + break; + } + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_shuffle_pi16(__m64 __m, int64_t __n) +{ + __m64 ret; + + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__n) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpckhwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype, + which preserves the data. */ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32, + datatype, which allows load8888 to use 32-bit loads. */ + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklhw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2) +{ + __m64 ret; + + asm("punpcklwd %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + + return ret; +} + +extern __inline void FUNCTION_ATTRIBS +_mm_store_pi32(__m32 *dest, __m64 src) +{ + src = _mm_packs_pu16(src, _mm_setzero_si64()); + + asm("swc1 %1, %0\n\t" + : "=m" (*dest) + : "f" (src) + : "memory" + ); +} + +extern __inline void FUNCTION_ATTRIBS +_mm_store_si64(__m64 *dest, __m64 src) +{ + asm("gssdlc1 %1, 7+%0\n\t" + "gssdrc1 %1, %0\n\t" + : "=m" (*dest) + : "f" (src) + : "memory" + ); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_load_si32(const __m32 *src) +{ + __m32 ret; + + asm("lwc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*src) + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_load_si64(const __m64 *src) +{ + __m64 ret; + + asm("ldc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*src) + : "memory" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadu_si64(const __m64 *src) +{ + __m64 ret; + + asm("gsldlc1 %0, 7(%1)\n\t" + "gsldrc1 %0, 0(%1)\n\t" + : "=f" (ret) + : "r" (src) + : "memory" + ); + + return ret; +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi8(const uint32_t *src) +{ + return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi8_f(__m64 src) +{ + return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi8_f(__m64 src) +{ + return _mm_unpackhi_pi8_f(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi16(__m64 src) +{ + return _mm_unpacklo_pi16(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadlo_pi16_f(__m64 src) +{ + return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi16(__m64 src) +{ + return _mm_unpackhi_pi16(src, _mm_setzero_si64()); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_loadhi_pi16_f(__m64 src) +{ + return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_expand_alpha(__m64 pixel) +{ + return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3)); +} + +extern __inline __m64 FUNCTION_ATTRIBS +_mm_expand_alpha_rev(__m64 pixel) +{ + return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0)); +} + +#endif /* __LOONGSON_MMINTRIN_H__ */ diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c new file mode 100644 index 0000000000..d2546eed32 --- /dev/null +++ b/media/libjpeg/simd/mips/jsimd.c @@ -0,0 +1,1147 @@ +/* + * jsimd_mips.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * MIPS architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; + +#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__) + +LOCAL(void) +parse_proc_cpuinfo(const char *search_string) +{ + const char *file_name = "/proc/cpuinfo"; + char cpuinfo_line[256]; + FILE *f = NULL; + + simd_support = 0; + + if ((f = fopen(file_name, "r")) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + simd_support |= JSIMD_DSPR2; + return; + } + } + fclose(f); + } + /* Did not find string in the proc file, or not Linux ELF. */ +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) + simd_support |= JSIMD_DSPR2; +#elif defined(__linux__) + /* We still have a chance to use MIPS DSPR2 regardless of globally used + * -mdspr2 options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux */ + parse_proc_cpuinfo("MIPS 74K"); +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEDSPR2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_DSPR2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +#endif +} + +static const int mips_idct_ifast_coefs[4] = { + 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */ + 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */ + 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */ + 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */ +}; + +/* The following struct is borrowed from jdsample.c */ +typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +typedef struct { + struct jpeg_upsampler pub; + JSAMPARRAY color_buf[MAX_COMPONENTS]; + upsample1_ptr methods[MAX_COMPONENTS]; + int next_row_out; + JDIMENSION rows_to_go; + int rowgroup_height[MAX_COMPONENTS]; + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_extrgb_ycc_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_extrgbx_ycc_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_extbgr_ycc_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_extbgrx_ycc_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_extxbgr_ycc_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_extxrgb_ycc_convert_dspr2; + break; + default: + dspr2fct = jsimd_extrgb_ycc_convert_dspr2; + break; + } + + dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_extrgb_gray_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_extrgbx_gray_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_extbgr_gray_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_extbgrx_gray_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_extxbgr_gray_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_extxrgb_gray_convert_dspr2; + break; + default: + dspr2fct = jsimd_extrgb_gray_convert_dspr2; + break; + } + + dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_ycc_extrgb_convert_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_ycc_extrgbx_convert_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_ycc_extbgr_convert_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_ycc_extbgrx_convert_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_ycc_extxbgr_convert_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_ycc_extxrgb_convert_dspr2; + break; + default: + dspr2fct = jsimd_ycc_extrgb_convert_dspr2; + break; + } + + dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf, + output_row, num_rows, cinfo->num_components); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling + * regression tests, probably because the DSPr2 SIMD implementation predates + * those tests. */ +#if 0 + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (DCTSIZE != 8) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling + * regression tests, probably because the DSPr2 SIMD implementation predates + * those tests. */ +#if 0 + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data, + compptr->v_samp_factor, + cinfo->max_v_samp_factor, + cinfo->smoothing_factor, + compptr->width_in_blocks, + cinfo->image_width); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample; + + jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index], + upsample->v_expand[compptr->component_index], + input_data, output_data_ptr, cinfo->output_width, + cinfo->max_v_samp_factor); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2; + break; + default: + dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; + break; + } + + dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2; + break; + case JCS_EXT_BGR: + dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2; + break; + default: + dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; + break; + } + + dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#ifndef __mips_soft_float + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_dspr2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +#ifndef __mips_soft_float + jsimd_convsamp_float_dspr2(sample_data, start_col, workspace); +#endif +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#ifndef __mips_soft_float + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +#ifndef __mips_soft_float + jsimd_quantize_float_dspr2(coef_block, divisors, workspace); +#endif +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12(void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int workspace[DCTSIZE * 4]; /* buffers data between passes */ + + jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col, + workspace); +} + +GLOBAL(void) +jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int workspace[96]; + int output[12] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + (int)(output_buf[8] + output_col), + (int)(output_buf[9] + output_col), + (int)(output_buf[10] + output_col), + (int)(output_buf[11] + output_col) + }; + + jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace); + jsimd_idct_12x12_pass2_dspr2(workspace, output); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + +#if defined(__MIPSEL__) + if (simd_support & JSIMD_DSPR2) + return 1; +#endif + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int output[8] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col) + }; + + jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output, + IDCT_range_limit(cinfo)); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + JCOEFPTR inptr; + IFAST_MULT_TYPE *quantptr; + DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (IFAST_MULT_TYPE *)compptr->dct_table; + + jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace, + mips_idct_ifast_coefs); + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col, + mips_idct_ifast_coefs); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S new file mode 100644 index 0000000000..c99288a8d1 --- /dev/null +++ b/media/libjpeg/simd/mips/jsimd_dspr2.S @@ -0,0 +1,4543 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> + * Darko Laus <darko.laus@imgtec.com> + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "jsimd_dspr2_asm.h" + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_c_null_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + * 20(sp) = cinfo->num_components + * + * Null conversion for compression + */ + SAVE_REGS_ON_STACK 8, s0, s1 + + lw t9, 24(sp) /* t9 = num_rows */ + lw s0, 28(sp) /* s0 = cinfo->num_components */ + andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */ + beqz t0, 4f /* no residual */ + nop +0: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +1: + sll t3, t1, 2 + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ + sll t4, a3, 2 + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +2: + lbu t3, 0(t2) + addiu t5, t5, 1 + sb t3, -1(t5) + bne t6, t5, 2b + addu t2, t2, s0 +3: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 3b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 1b + nop + addiu a1, a1, 4 + bgez t9, 0b + addiu a3, a3, 1 + b 7f + nop +4: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +5: + sll t3, t1, 2 + lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ + lw t2, 0(a1) /* t2 = inptr = *input_buf */ + sll t4, a3, 2 + lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +6: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 6b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 5b + nop + addiu a1, a1, 4 + bgez t9, 4b + addiu a3, a3, 1 +7: + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop + +END(jsimd_c_null_convert_dspr2) + + +/*****************************************************************************/ +/* + * jsimd_extrgb_ycc_convert_dspr2 + * jsimd_extbgr_ycc_convert_dspr2 + * jsimd_extrgbx_ycc_convert_dspr2 + * jsimd_extbgrx_ycc_convert_dspr2 + * jsimd_extxbgr_ycc_convert_dspr2 + * jsimd_extxrgb_ycc_convert_dspr2 + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs + +.macro DO_RGB_TO_YCC r, g, b, inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw t7, 48(sp) /* t7 = num_rows */ + li s0, 0x4c8b /* FIX(0.29900) */ + li s1, 0x9646 /* FIX(0.58700) */ + li s2, 0x1d2f /* FIX(0.11400) */ + li s3, 0xffffd4cd /* -FIX(0.16874) */ + li s4, 0xffffab33 /* -FIX(0.33126) */ + li s5, 0x8000 /* FIX(0.50000) */ + li s6, 0xffff94d1 /* -FIX(0.41869) */ + li s7, 0xffffeb2f /* -FIX(0.08131) */ + li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */ + +0: + addiu t7, -1 /* --num_rows */ + lw t6, 0(a1) /* t6 = input_buf[0] */ + lw t0, 0(a2) + lw t1, 4(a2) + lw t2, 8(a2) + sll t3, a3, 2 + lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */ + lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */ + lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */ + + addu t9, t2, a0 /* t9 = end address */ + addiu a3, 1 + +1: + DO_RGB_TO_YCC t3, t4, t5, t6 + + mtlo s5, $ac0 + mtlo t8, $ac1 + mtlo t8, $ac2 + maddu $ac0, s2, t5 + maddu $ac1, s5, t5 + maddu $ac2, s5, t3 + maddu $ac0, s0, t3 + maddu $ac1, s3, t3 + maddu $ac2, s6, t4 + maddu $ac0, s1, t4 + maddu $ac1, s4, t4 + maddu $ac2, s7, t5 + extr.w t3, $ac0, 16 + extr.w t4, $ac1, 16 + extr.w t5, $ac2, 16 + sb t3, 0(t0) + sb t4, 0(t1) + sb t5, 0(t2) + addiu t0, 1 + addiu t2, 1 + bne t2, t9, 1b + addiu t1, 1 + bgtz t7, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_ycc_convert_dspr2) + +.purgem DO_RGB_TO_YCC + +.endm + +/*-------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 + + +/*****************************************************************************/ +/* + * jsimd_ycc_extrgb_convert_dspr2 + * jsimd_ycc_extbgr_convert_dspr2 + * jsimd_ycc_extrgbx_convert_dspr2 + * jsimd_ycc_extbgrx_convert_dspr2 + * jsimd_ycc_extxbgr_convert_dspr2 + * jsimd_ycc_extxrgb_convert_dspr2 + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs, a_offs + +.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr + sb \scratch0, \r_offs(\outptr) + sb \scratch1, \g_offs(\outptr) + sb \scratch2, \b_offs(\outptr) +.if (\pixel_size == 4) + li t0, 0xFF + sb t0, \a_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = input_row + * a3 = output_buf + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s1, 48(sp) + li t3, 0x8000 + li t4, 0x166e9 /* FIX(1.40200) */ + li t5, 0x1c5a2 /* FIX(1.77200) */ + li t6, 0xffff492e /* -FIX(0.71414) */ + li t7, 0xffffa7e6 /* -FIX(0.34414) */ + repl.ph t8, 128 + +0: + lw s0, 0(a3) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + sll s5, a2, 2 + addiu s1, -1 + lwx s2, s5(t0) + lwx s3, s5(t1) + lwx s4, s5(t2) + addu t9, s2, a0 + addiu a2, 1 + +1: + lbu s7, 0(s4) /* cr */ + lbu s6, 0(s3) /* cb */ + lbu s5, 0(s2) /* y */ + addiu s2, 1 + addiu s4, 1 + addiu s7, -128 + addiu s6, -128 + mul t2, t7, s6 + mul t0, t6, s7 /* Crgtab[cr] */ + sll s7, 15 + mulq_rs.w t1, t4, s7 /* Crrtab[cr] */ + sll s6, 15 + addu t2, t3 /* Cbgtab[cb] */ + addu t2, t0 + + mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */ + sra t2, 16 + addu t1, s5 + addu t2, s5 /* add y */ + ins t2, t1, 16, 16 + subu.ph t2, t2, t8 + addu t0, s5 + shll_s.ph t2, t2, 8 + subu t0, 128 + shra.ph t2, t2, 8 + shll_s.w t0, t0, 24 + addu.ph t2, t2, t8 /* clip & store */ + sra t0, t0, 24 + sra t1, t2, 16 + addiu t0, 128 + + STORE_YCC_TO_RGB t1, t2, t0, s0 + + bne s2, t9, 1b + addiu s3, 1 + bgtz s1, 0b + addiu a3, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_ycc_\colorid\()_convert_dspr2) + +.purgem STORE_YCC_TO_RGB + +.endm + +/*-------------------------------------id -- pix R G B A */ +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0 +GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0 + + +/*****************************************************************************/ +/* + * jsimd_extrgb_gray_convert_dspr2 + * jsimd_extbgr_gray_convert_dspr2 + * jsimd_extrgbx_gray_convert_dspr2 + * jsimd_extbgrx_gray_convert_dspr2 + * jsimd_extxbgr_gray_convert_dspr2 + * jsimd_extxrgb_gray_convert_dspr2 + * + * Colorspace conversion RGB -> GRAY + */ + +.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \ + r_offs, g_offs, b_offs + +.macro DO_RGB_TO_GRAY r, g, b, inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2) +/* + * a0 = cinfo->image_width + * a1 = input_buf + * a2 = output_buf + * a3 = output_row + * 16(sp) = num_rows + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + li s0, 0x4c8b /* s0 = FIX(0.29900) */ + li s1, 0x9646 /* s1 = FIX(0.58700) */ + li s2, 0x1d2f /* s2 = FIX(0.11400) */ + li s7, 0x8000 /* s7 = FIX(0.50000) */ + lw s6, 48(sp) + andi t7, a0, 3 + +0: + addiu s6, -1 /* s6 = num_rows */ + lw t0, 0(a1) + lw t1, 0(a2) + sll t3, a3, 2 + lwx t1, t3(t1) + addiu a3, 1 + addu t9, t1, a0 + subu t8, t9, t7 + beq t1, t8, 2f + nop + +1: + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t6, $ac0, 16 + + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + extr.w t2, $ac1, 16 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t5, $ac0, 16 + sb t6, 0(t1) + sb t2, 1(t1) + extr.w t3, $ac1, 16 + addiu t1, 4 + sb t5, -2(t1) + sb t3, -1(t1) + bne t1, t8, 1b + nop + +2: + beqz t7, 4f + nop + +3: + DO_RGB_TO_GRAY t3, t4, t5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + extr.w t6, $ac0, 16 + sb t6, 0(t1) + addiu t1, 1 + bne t1, t9, 3b + nop + +4: + bgtz s6, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_gray_convert_dspr2) + +.purgem DO_RGB_TO_GRAY + +.endm + +/*-------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 + + +/*****************************************************************************/ +/* + * jsimd_h2v2_merged_upsample_dspr2 + * jsimd_h2v2_extrgb_merged_upsample_dspr2 + * jsimd_h2v2_extrgbx_merged_upsample_dspr2 + * jsimd_h2v2_extbgr_merged_upsample_dspr2 + * jsimd_h2v2_extbgrx_merged_upsample_dspr2 + * jsimd_h2v2_extxbgr_merged_upsample_dspr2 + * jsimd_h2v2_extxrgb_merged_upsample_dspr2 + * + * Merged h2v2 upsample routines + */ +.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ + r1_offs, g1_offs, \ + b1_offs, a1_offs, \ + r2_offs, g2_offs, \ + b2_offs, a2_offs + +.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ + scratch5 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li \scratch0, 0xFF + sb \scratch0, \a1_offs(\outptr) + sb \scratch0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) +/* + * a0 = cinfo->output_width + * a1 = input_buf + * a2 = in_row_group_ctr + * a3 = output_buf + * 16(sp) = cinfo->sample_range_limit + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + lw t9, 56(sp) /* cinfo->sample_range_limit */ + lw v0, 0(a1) + lw v1, 4(a1) + lw t0, 8(a1) + sll t1, a2, 3 + addiu t2, t1, 4 + sll t3, a2, 2 + lw t4, 0(a3) /* t4 = output_buf[0] */ + lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */ + lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */ + lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */ + lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */ + lw t7, 4(a3) /* t7 = output_buf[1] */ + li s1, 0xe6ea + addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */ + addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */ + addiu s1, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s2, s1, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ + srl t3, a0, 1 + blez t3, 2f + addu t0, t5, t3 /* t0 = end address */ + 1: + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t5, t5, 1 + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ + extr_r.w s5, $ac1, 16 + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ + lbu v0, 0(t1) + addiu t6, t6, 1 + addiu t1, t1, 2 + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, -1(t1) + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 + + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, 1(t2) + addiu t2, t2, 2 + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 + + bne t0, t5, 1b + nop +2: + andi t0, a0, 1 + beqz t0, 4f + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t3, t3, -128 /* (cb - 128) */ + addiu s3, s3, -128 /* (cr - 128) */ + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + lbu v0, 0(t1) + extr_r.w s5, $ac1, 16 + mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_1_PIXEL t3, s3, v1, t4 + + addu t3, v0, s4 /* y+cred */ + addu s3, v0, s5 /* y+cgreen */ + addu v1, v0, s6 /* y+cblue */ + addu t3, t9, t3 /* y+cred */ + addu s3, t9, s3 /* y+cgreen */ + addu v1, t9, v1 /* y+cblue */ + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_1_PIXEL t3, s3, v1, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) + +.purgem STORE_H2V2_1_PIXEL +.purgem STORE_H2V2_2_PIXELS +.endm + +/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 + + +/*****************************************************************************/ +/* + * jsimd_h2v1_merged_upsample_dspr2 + * jsimd_h2v1_extrgb_merged_upsample_dspr2 + * jsimd_h2v1_extrgbx_merged_upsample_dspr2 + * jsimd_h2v1_extbgr_merged_upsample_dspr2 + * jsimd_h2v1_extbgrx_merged_upsample_dspr2 + * jsimd_h2v1_extxbgr_merged_upsample_dspr2 + * jsimd_h2v1_extxrgb_merged_upsample_dspr2 + * + * Merged h2v1 upsample routines + */ + +.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ + r1_offs, g1_offs, \ + b1_offs, a1_offs, \ + r2_offs, g2_offs, \ + b2_offs, a2_offs + +.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ + scratch5 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) + sb t0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) +/* + * a0 = cinfo->output_width + * a1 = input_buf + * a2 = in_row_group_ctr + * a3 = output_buf + * 16(sp) = range_limit + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + li t0, 0xe6ea + lw t1, 0(a1) /* t1 = input_buf[0] */ + lw t2, 4(a1) /* t2 = input_buf[1] */ + lw t3, 8(a1) /* t3 = input_buf[2] */ + lw t8, 56(sp) /* t8 = range_limit */ + addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */ + addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */ + addiu s0, t0, 0x9916 /* s0 = 0x8000 */ + addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ + xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ + srl t0, a0, 1 + sll t4, a2, 2 + lwx s5, t4(t1) /* s5 = inptr0 */ + lwx s6, t4(t2) /* s6 = inptr1 */ + lwx s7, t4(t3) /* s7 = inptr2 */ + lw t7, 0(a3) /* t7 = outptr */ + blez t0, 2f + addu t9, s6, t0 /* t9 = end address */ +1: + lbu t2, 0(s6) /* t2 = cb */ + lbu t0, 0(s7) /* t0 = cr */ + lbu t1, 0(s5) /* t1 = y */ + addiu t2, t2, -128 /* t2 = cb - 128 */ + addiu t0, t0, -128 /* t0 = cr - 128 */ + mult $ac1, s4, t2 + madd $ac1, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */ + extr_r.w t5, $ac1, 16 + mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */ + addiu s7, s7, 1 + addiu s6, s6, 1 + addu t2, t1, t0 /* t2 = y + cred */ + addu t3, t1, t5 /* t3 = y + cgreen */ + addu t4, t1, t6 /* t4 = y + cblue */ + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t1, 1(s5) + lbu v0, 0(t2) + lbu v1, 0(t3) + lbu ra, 0(t4) + addu t2, t1, t0 + addu t3, t1, t5 + addu t4, t1, t6 + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 + + bne t9, s6, 1b + addiu s5, s5, 2 +2: + andi t0, a0, 1 + beqz t0, 4f + nop +3: + lbu t2, 0(s6) + lbu t0, 0(s7) + lbu t1, 0(s5) + addiu t2, t2, -128 /* (cb - 128) */ + addiu t0, t0, -128 /* (cr - 128) */ + mul t3, s4, t2 + mul t4, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */ + mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */ + addu t3, t3, s0 + addu t3, t4, t3 + sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */ + addu t2, t1, t0 /* y + cred */ + addu t3, t1, t5 /* y + cgreen */ + addu t4, t1, t6 /* y + cblue */ + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_1_PIXEL t2, t3, t4, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) + +.purgem STORE_H2V1_1_PIXEL +.purgem STORE_H2V1_2_PIXELS +.endm + +/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 + + +/*****************************************************************************/ +/* + * jsimd_h2v2_fancy_upsample_dspr2 + * + * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + */ +LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = downsampled_width + * a2 = input_data + * a3 = output_data_ptr + */ + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + li s4, 0 + lw s2, 0(a3) /* s2 = *output_data_ptr */ +0: + li t9, 2 + lw s1, -4(a2) /* s1 = inptr1 */ + +1: + lw s0, 0(a2) /* s0 = inptr0 */ + lwx s3, s4(s2) + addiu s5, a1, -2 /* s5 = downsampled_width - 2 */ + srl t4, s5, 1 + sll t4, t4, 1 + lbu t0, 0(s0) + lbu t1, 1(s0) + lbu t2, 0(s1) + lbu t3, 1(s1) + addiu s0, 2 + addiu s1, 2 + addu t8, s0, t4 /* t8 = end address */ + andi s5, s5, 1 /* s5 = residual */ + sll t4, t0, 1 + sll t6, t1, 1 + addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */ + addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */ + addu t7, t0, t2 /* t7 = thiscolsum */ + addu t6, t1, t3 /* t5 = nextcolsum */ + sll t0, t7, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t7 /* t1 = thiscolsum * 3 */ + shra_r.w t0, t0, 4 + addiu t1, 7 + addu t1, t1, t6 + srl t1, t1, 4 + sb t0, 0(s3) + sb t1, 1(s3) + beq t8, s0, 22f /* skip to final iteration if width == 3 */ + addiu s3, 2 +2: + lh t0, 0(s0) /* t0 = A3|A2 */ + lh t2, 0(s1) /* t2 = B3|B2 */ + addiu s0, 2 + addiu s1, 2 + preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */ + preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */ + shll.ph t1, t0, 1 + sll t3, t6, 1 + addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */ + addu t3, t3, t6 /* t3 = this * 3 */ + addu.ph t0, t0, t2 /* t0 = next2|next1 */ + addu t1, t3, t7 + andi t7, t0, 0xFFFF /* t7 = next1 */ + sll t2, t7, 1 + addu t2, t7, t2 /* t2 = next1*3 */ + addu t4, t2, t6 + srl t6, t0, 16 /* t6 = next2 */ + shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */ + addu t0, t3, t7 + addiu t0, 7 + srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */ + shra_r.w t4, t4, 4 /* t3 = (next1*3 + this + 8) >> 4 */ + addu t2, t2, t6 + addiu t2, 7 + srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */ + sb t1, 0(s3) + sb t0, 1(s3) + sb t4, 2(s3) + sb t2, 3(s3) + bne t8, s0, 2b + addiu s3, 4 +22: + beqz s5, 4f + addu t8, s0, s5 +3: + lbu t0, 0(s0) + lbu t2, 0(s1) + addiu s0, 1 + addiu s1, 1 + sll t3, t6, 1 + sll t1, t0, 1 + addu t1, t0, t1 /* t1 = inptr0 * 3 */ + addu t3, t3, t6 /* t3 = thiscolsum * 3 */ + addu t5, t1, t2 + addu t1, t3, t7 + shra_r.w t1, t1, 4 + addu t0, t3, t5 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu s3, 2 + move t7, t6 + bne t8, s0, 3b + move t6, t5 +4: + sll t0, t6, 2 /* t0 = thiscolsum * 4 */ + subu t1, t0, t6 /* t1 = thiscolsum * 3 */ + addu t1, t1, t7 + addiu s4, 4 + shra_r.w t1, t1, 4 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu t9, -1 + addiu s3, 2 + bnez t9, 1b + lw s1, 4(a2) + srl t0, s4, 2 + subu t0, a0, t0 + bgtz t0, 0b + addiu a2, 4 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop +END(jsimd_h2v2_fancy_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = downsampled_width + * a2 = input_data + * a3 = output_data_ptr + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + .set at + + beqz a0, 3f + sll t0, a0, 2 + lw s1, 0(a3) + li s3, 0x10001 + addu s0, s1, t0 +0: + addiu t8, a1, -2 + srl t9, t8, 2 + lw t7, 0(a2) + lw s2, 0(s1) + lbu t0, 0(t7) + lbu t1, 1(t7) /* t1 = inptr[1] */ + sll t2, t0, 1 + addu t2, t2, t0 /* t2 = invalue*3 */ + addu t2, t2, t1 + shra_r.w t2, t2, 2 + sb t0, 0(s2) + sb t2, 1(s2) + beqz t9, 11f + addiu s2, 2 +1: + ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */ + ulw t1, 1(t7) + ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */ + preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */ + preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */ + preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */ + preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */ + preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */ + shll.ph t5, t4, 1 + shll.ph t6, t1, 1 + addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */ + addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */ + addu.ph t4, t3, s3 + addu.ph t0, t0, s3 + addu.ph t4, t4, t5 + addu.ph t0, t0, t6 + shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */ + shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */ + addu.ph t2, t2, t5 + addu.ph t3, t3, t6 + shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */ + shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */ + shll.ph t2, t2, 8 + shll.ph t3, t3, 8 + or t2, t4, t2 + or t3, t3, t0 + addiu t9, -1 + usw t3, 0(s2) + usw t2, 4(s2) + addiu s2, 8 + bgtz t9, 1b + addiu t7, 4 +11: + andi t8, 3 + beqz t8, 22f + addiu t7, 1 + +2: + lbu t0, 0(t7) + addiu t7, 1 + sll t1, t0, 1 + addu t2, t0, t1 /* t2 = invalue */ + lbu t3, -2(t7) + lbu t4, 0(t7) + addiu t3, 1 + addiu t4, 2 + addu t3, t3, t2 + addu t4, t4, t2 + srl t3, 2 + srl t4, 2 + sb t3, 0(s2) + sb t4, 1(s2) + addiu t8, -1 + bgtz t8, 2b + addiu s2, 2 + +22: + lbu t0, 0(t7) + lbu t2, -1(t7) + sll t1, t0, 1 + addu t1, t1, t0 /* t1 = invalue * 3 */ + addu t1, t1, t2 + addiu t1, 1 + srl t1, t1, 2 + sb t1, 0(s2) + sb t0, 1(s2) + addiu s1, 4 + bne s1, s0, 0b + addiu a2, 4 +3: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_h2v1_fancy_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) +/* + * a0 = cinfo->image_width + * a1 = cinfo->max_v_samp_factor + * a2 = compptr->v_samp_factor + * a3 = compptr->width_in_blocks + * 16(sp) = input_data + * 20(sp) = output_data + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 + + beqz a2, 7f + lw s1, 44(sp) /* s1 = output_data */ + lw s0, 40(sp) /* s0 = input_data */ + srl s2, a0, 2 + andi t9, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 /* t0 = width_in_blocks*DCT */ + srl t7, t0, 1 + subu s2, t7, s2 +0: + andi t6, a0, 1 /* t6 = temp_index */ + addiu t6, -1 + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + li s3, 0 /* s3 = bias */ + srl t7, a0, 1 /* t7 = image_width1 */ + srl s4, t7, 2 + andi t8, t7, 3 +1: + ulhu t0, 0(t5) + ulhu t1, 2(t5) + ulhu t2, 4(t5) + ulhu t3, 6(t5) + raddu.w.qb t0, t0 + raddu.w.qb t1, t1 + raddu.w.qb t2, t2 + raddu.w.qb t3, t3 + shra.ph t0, t0, 1 + shra_r.ph t1, t1, 1 + shra.ph t2, t2, 1 + shra_r.ph t3, t3, 1 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t3, 3(t4) + addiu s4, -1 + addiu t4, 4 + bgtz s4, 1b + addiu t5, 8 + beqz t8, 3f + addu s4, t4, t8 +2: + ulhu t0, 0(t5) + raddu.w.qb t0, t0 + addqh.w t0, t0, s3 + xori s3, s3, 1 + sb t0, 0(t4) + addiu t4, 1 + bne t4, s4, 2b + addiu t5, 2 +3: + lbux t1, t6(t5) + sll t1, 1 + addqh.w t2, t1, s3 /* t2 = pixval1 */ + xori s3, s3, 1 + addqh.w t3, t1, s3 /* t3 = pixval2 */ + blez s2, 5f + append t3, t2, 8 + addu t5, t4, s2 /* t5 = loop_end2 */ +4: + ush t3, 0(t4) + addiu s2, -1 + bgtz s2, 4b + addiu t4, 2 +5: + beqz t9, 6f + nop + sb t2, 0(t4) +6: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 4 +7: + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 + + j ra + nop +END(jsimd_h2v1_downsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) +/* + * a0 = cinfo->image_width + * a1 = cinfo->max_v_samp_factor + * a2 = compptr->v_samp_factor + * a3 = compptr->width_in_blocks + * 16(sp) = input_data + * 20(sp) = output_data + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + beqz a2, 8f + lw s1, 52(sp) /* s1 = output_data */ + lw s0, 48(sp) /* s0 = input_data */ + + andi t6, a0, 1 /* t6 = temp_index */ + addiu t6, -1 + srl t7, a0, 1 /* t7 = image_width1 */ + srl s4, t7, 2 + andi t8, t7, 3 + andi t9, a0, 2 + srl s2, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 /* s2 = width_in_blocks*DCT */ + srl t7, t0, 1 + subu s2, t7, s2 +0: + lw t4, 0(s1) /* t4 = outptr */ + lw t5, 0(s0) /* t5 = inptr0 */ + lw s7, 4(s0) /* s7 = inptr1 */ + li s6, 1 /* s6 = bias */ +2: + ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */ + ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */ + ulw t2, 4(t5) + ulw t3, 4(s7) + precrq.ph.w t7, t0, t1 /* t2 = |P3|P2|Q3|Q2| */ + ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */ + raddu.w.qb t1, t7 + raddu.w.qb t0, t0 + shra_r.w t1, t1, 2 + addiu t0, 1 + srl t0, 2 + precrq.ph.w t7, t2, t3 + ins t2, t3, 16, 16 + raddu.w.qb t7, t7 + raddu.w.qb t2, t2 + shra_r.w t7, t7, 2 + addiu t2, 1 + srl t2, 2 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t7, 3(t4) + addiu t4, 4 + addiu t5, 8 + addiu s4, s4, -1 + bgtz s4, 2b + addiu s7, 8 + beqz t8, 4f + addu t8, t4, t8 +3: + ulhu t0, 0(t5) + ulhu t1, 0(s7) + ins t0, t1, 16, 16 + raddu.w.qb t0, t0 + addu t0, t0, s6 + srl t0, 2 + xori s6, s6, 3 + sb t0, 0(t4) + addiu t5, 2 + addiu t4, 1 + bne t8, t4, 3b + addiu s7, 2 +4: + lbux t1, t6(t5) + sll t1, 1 + lbux t0, t6(s7) + sll t0, 1 + addu t1, t1, t0 + addu t3, t1, s6 + srl t0, t3, 2 /* t2 = pixval1 */ + xori s6, s6, 3 + addu t2, t1, s6 + srl t1, t2, 2 /* t3 = pixval2 */ + blez s2, 6f + append t1, t0, 8 +5: + ush t1, 0(t4) + addiu s2, -1 + bgtz s2, 5b + addiu t4, 2 +6: + beqz t9, 7f + nop + sb t0, 0(t4) +7: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 8 +8: + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_h2v2_downsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) +/* + * a0 = input_data + * a1 = output_data + * a2 = compptr->v_samp_factor + * a3 = cinfo->max_v_samp_factor + * 16(sp) = cinfo->smoothing_factor + * 20(sp) = compptr->width_in_blocks + * 24(sp) = cinfo->image_width + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s7, 52(sp) /* compptr->width_in_blocks */ + lw s0, 56(sp) /* cinfo->image_width */ + lw s6, 48(sp) /* cinfo->smoothing_factor */ + sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */ + sll v0, s7, 1 + subu v0, v0, s0 + blez v0, 2f + move v1, zero + addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */ +0: + addiu t1, a0, -4 + sll t2, v1, 2 + lwx t1, t2(t1) + move t3, v0 + addu t1, t1, s0 + lbu t2, -1(t1) +1: + addiu t3, t3, -1 + sb t2, 0(t1) + bgtz t3, 1b + addiu t1, t1, 1 + addiu v1, v1, 1 + bne v1, t0, 0b + nop +2: + li v0, 80 + mul v0, s6, v0 + li v1, 16384 + move t4, zero + move t5, zero + subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */ + sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */ +3: +/* Special case for first column: pretend column -1 is same as column 0 */ + sll v0, t4, 2 + lwx t8, v0(a1) /* outptr = output_data[outrow] */ + sll v1, t5, 2 + addiu t9, v1, 4 + addiu s0, v1, -4 + addiu s1, v1, 8 + lwx s2, v1(a0) /* inptr0 = input_data[inrow] */ + lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */ + lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */ + lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, 0(s2) + lbu v1, 2(s2) + lbu t0, 0(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, 0(s0) + lbu t0, 0(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w v0, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + addiu s1, s1, 2 + sb v0, -1(t8) + addiu s4, s7, -2 + and s4, s4, 3 + addu s5, s4, t8 /* end address */ +4: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + sb t2, -1(t8) + bne s5, t8, 4b + addiu s1, s1, 2 + addiu s5, s7, -2 + subu s5, s5, s4 + addu s5, s5, t8 /* end address */ +5: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + lh v1, 2(t9) + addu t0, t0, v0 + lh v0, 2(s2) + addu s3, t0, s3 + lh t0, 2(s0) + lh t1, 2(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 4(s2) + lbu t0, 1(t9) + lbu t1, 4(t9) + sb t2, 0(t8) + raddu.w.qb t3, v0 + lbu v0, 1(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 4(s0) + addu t0, t0, v0 + lbu v0, 1(s0) + addu s3, t0, s3 + lbu t0, 1(s1) + lbu t3, 4(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 4(t9) + addu t0, t0, v0 + lh v0, 4(s2) + addu s3, t0, s3 + lh t0, 4(s0) + lh t1, 4(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 6(s2) + lbu t0, 3(t9) + lbu t1, 6(t9) + sb t2, 1(t8) + raddu.w.qb t3, v0 + lbu v0, 3(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 6(s0) + addu t0, t0, v0 + lbu v0, 3(s0) + addu s3, t0, s3 + lbu t0, 3(s1) + lbu t3, 6(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 6(t9) + addu t0, t0, v0 + lh v0, 6(s2) + addu s3, t0, s3 + lh t0, 6(s0) + lh t1, 6(s1) + madd $ac1, s3, t7 + extr_r.w t3, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 8(s2) + lbu t0, 5(t9) + lbu t1, 8(t9) + sb t3, 2(t8) + raddu.w.qb t2, v0 + lbu v0, 5(s2) + addu t0, t0, t1 + mult $ac1, t2, t6 + addu v0, v0, v1 + lbu t2, 8(s0) + addu t0, t0, v0 + lbu v0, 5(s0) + addu s3, t0, s3 + lbu t0, 5(s1) + lbu t3, 8(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + addiu t8, t8, 4 + addu t0, t0, v0 + addiu s2, s2, 8 + addu s3, t0, s3 + addiu t9, t9, 8 + madd $ac1, s3, t7 + extr_r.w t1, $ac1, 16 + addiu s0, s0, 8 + addiu s1, s1, 8 + bne s5, t8, 5b + sb t1, -1(t8) +/* Special case for last column */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 1(s2) + lbu t0, -1(t9) + lbu t1, 1(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 1(s0) + addu t0, t0, v0 + lbu t3, 1(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t0, $ac1, 16 + addiu t5, t5, 2 + sb t0, 0(t8) + addiu t4, t4, 1 + bne t4, a2, 3b + addiu t5, t5, 2 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_h2v2_smooth_downsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_int_upsample_dspr2) +/* + * a0 = upsample->h_expand[compptr->component_index] + * a1 = upsample->v_expand[compptr->component_index] + * a2 = input_data + * a3 = output_data_ptr + * 16(sp) = cinfo->output_width + * 20(sp) = cinfo->max_v_samp_factor + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + lw s0, 0(a3) /* s0 = output_data */ + lw s1, 32(sp) /* s1 = cinfo->output_width */ + lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */ + li t6, 0 /* t6 = inrow */ + beqz s2, 10f + li s3, 0 /* s3 = outrow */ +0: + addu t0, a2, t6 + addu t7, s0, s3 + lw t3, 0(t0) /* t3 = inptr */ + lw t8, 0(t7) /* t8 = outptr */ + beqz s1, 4f + addu t5, t8, s1 /* t5 = outend */ +1: + lb t2, 0(t3) /* t2 = invalue = *inptr++ */ + addiu t3, 1 + beqz a0, 3f + move t0, a0 /* t0 = h_expand */ +2: + sb t2, 0(t8) + addiu t0, -1 + bgtz t0, 2b + addiu t8, 1 +3: + bgt t5, t8, 1b + nop +4: + addiu t9, a1, -1 /* t9 = v_expand - 1 */ + blez t9, 9f + nop +5: + lw t3, 0(s0) + lw t4, 4(s0) + subu t0, s1, 0xF + blez t0, 7f + addu t5, t3, s1 /* t5 = end address */ + andi t7, s1, 0xF /* t7 = residual */ + subu t8, t5, t7 +6: + ulw t0, 0(t3) + ulw t1, 4(t3) + ulw t2, 8(t3) + usw t0, 0(t4) + ulw t0, 12(t3) + usw t1, 4(t4) + usw t2, 8(t4) + usw t0, 12(t4) + addiu t3, 16 + bne t3, t8, 6b + addiu t4, 16 + beqz t7, 8f + nop +7: + lbu t0, 0(t3) + sb t0, 0(t4) + addiu t3, 1 + bne t3, t5, 7b + addiu t4, 1 +8: + addiu t9, -1 + bgtz t9, 5b + addiu s0, 8 +9: + addu s3, s3, a1 + bne s3, s2, 0b + addiu t6, 1 +10: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_int_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v1_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = cinfo->output_width + * a2 = input_data + * a3 = output_data_ptr + */ + lw t7, 0(a3) /* t7 = output_data */ + andi t8, a1, 0xf /* t8 = residual */ + sll t0, a0, 2 + blez a0, 4f + addu t9, t7, t0 /* t9 = output_data end address */ +0: + lw t5, 0(t7) /* t5 = outptr */ + lw t6, 0(a2) /* t6 = inptr */ + addu t3, t5, a1 /* t3 = outptr + output_width (end address) */ + subu t3, t8 /* t3 = end address - residual */ + beq t5, t3, 2f + move t4, t8 +1: + ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */ + ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */ + srl t1, t0, 16 /* t1 = |X|X|P3|P2| */ + ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */ + ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */ + ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */ + ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */ + usw t0, 0(t5) + usw t1, 4(t5) + srl t0, t2, 16 /* t0 = |X|X|P7|P6| */ + ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */ + ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */ + ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */ + ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */ + usw t2, 8(t5) + usw t0, 12(t5) + addiu t5, 16 + bne t5, t3, 1b + addiu t6, 8 + beqz t8, 3f + move t4, t8 +2: + lbu t1, 0(t6) + sb t1, 0(t5) + sb t1, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + addiu t7, 4 + bne t9, t7, 0b + addiu a2, 4 +4: + j ra + nop +END(jsimd_h2v1_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) +/* + * a0 = cinfo->max_v_samp_factor + * a1 = cinfo->output_width + * a2 = input_data + * a3 = output_data_ptr + */ + lw t7, 0(a3) + blez a0, 7f + andi t9, a1, 0xf /* t9 = residual */ +0: + lw t6, 0(a2) /* t6 = inptr */ + lw t5, 0(t7) /* t5 = outptr */ + addu t8, t5, a1 /* t8 = outptr end address */ + subu t8, t9 /* t8 = end address - residual */ + beq t5, t8, 2f + move t4, t9 +1: + ulw t0, 0(t6) + srl t1, t0, 16 + ins t0, t0, 16, 16 + ins t0, t0, 8, 16 + ins t1, t1, 16, 16 + ins t1, t1, 8, 16 + ulw t2, 4(t6) + usw t0, 0(t5) + usw t1, 4(t5) + srl t3, t2, 16 + ins t2, t2, 16, 16 + ins t2, t2, 8, 16 + ins t3, t3, 16, 16 + ins t3, t3, 8, 16 + usw t2, 8(t5) + usw t3, 12(t5) + addiu t5, 16 + bne t5, t8, 1b + addiu t6, 8 + beqz t9, 3f + move t4, t9 +2: + lbu t0, 0(t6) + sb t0, 0(t5) + sb t0, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + lw t6, 0(t7) /* t6 = outptr[0] */ + lw t5, 4(t7) /* t5 = outptr[1] */ + addu t4, t6, a1 /* t4 = new end address */ + beq a1, t9, 5f + subu t8, t4, t9 +4: + ulw t0, 0(t6) + ulw t1, 4(t6) + ulw t2, 8(t6) + usw t0, 0(t5) + ulw t0, 12(t6) + usw t1, 4(t5) + usw t2, 8(t5) + usw t0, 12(t5) + addiu t6, 16 + bne t6, t8, 4b + addiu t5, 16 + beqz t9, 6f + nop +5: + lbu t0, 0(t6) + sb t0, 0(t5) + addiu t6, 1 + bne t6, t4, 5b + addiu t5, 1 +6: + addiu t7, 8 + addiu a0, -2 + bgtz a0, 0b + addiu a2, 4 +7: + j ra + nop +END(jsimd_h2v2_upsample_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_islow_dspr2) +/* + * a0 = coef_block + * a1 = compptr->dcttable + * a2 = output + * a3 = range_limit + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -256 + move v0, sp + addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */ +1: + lh s4, 32(a0) /* s4 = inptr[16] */ + lh s5, 64(a0) /* s5 = inptr[32] */ + lh s6, 96(a0) /* s6 = inptr[48] */ + lh t1, 112(a0) /* t1 = inptr[56] */ + lh t7, 16(a0) /* t7 = inptr[8] */ + lh t5, 80(a0) /* t5 = inptr[40] */ + lh t3, 48(a0) /* t3 = inptr[24] */ + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, t5 + or s4, s4, t7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 2f + addiu v1, v1, -1 + lh s5, 0(a1) /* quantptr[DCTSIZE*0] */ + lh s6, 0(a0) /* inptr[DCTSIZE*0] */ + mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */ + sll s5, s5, 2 + sw s5, 0(v0) + sw s5, 32(v0) + sw s5, 64(v0) + sw s5, 96(v0) + sw s5, 128(v0) + sw s5, 160(v0) + sw s5, 192(v0) + b 3f + sw s5, 224(v0) +2: + lh t0, 112(a1) + lh t2, 48(a1) + lh t4, 80(a1) + lh t6, 16(a1) + mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7], + quantptr[DCTSIZE*7]) */ + mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3], + quantptr[DCTSIZE*3]) */ + mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5], + quantptr[DCTSIZE*5]) */ + mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1], + quantptr[DCTSIZE*1]) */ + lh t4, 32(a1) + lh t5, 32(a0) + lh t6, 96(a1) + lh t7, 96(a0) + addu s0, t0, t1 /* z3 = tmp0 + tmp2 */ + addu s1, t1, t2 /* z2 = tmp1 + tmp2 */ + addu s2, t2, t3 /* z4 = tmp1 + tmp3 */ + addu s3, s0, s2 /* z3 + z4 */ + addiu t9, zero, 9633 /* FIX_1_175875602 */ + mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu t8, t0, t3 /* z1 = tmp0 + tmp3 */ + addiu t9, zero, 2446 /* FIX_0_298631336 */ + mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t9, zero, 16819 /* FIX_2_053119869 */ + mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t9, zero, 25172 /* FIX_3_072711026 */ + mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t9, zero, 12299 /* FIX_1_501321110 */ + mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + addiu t9, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t9, zero, 3196 /* FIX_0_390180644 */ + mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t9, zero, 7373 /* FIX_0_899976223 */ + mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t9, zero, 20995 /* FIX_2_562915447 */ + mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + subu s0, s3, s0 /* z3 += z5 */ + addu t0, t0, s0 /* tmp0 += z3 */ + addu t1, t1, s0 /* tmp2 += z3 */ + subu s2, s3, s2 /* z4 += z5 */ + addu t2, t2, s2 /* tmp1 += z4 */ + addu t3, t3, s2 /* tmp3 += z4 */ + subu t0, t0, t8 /* tmp0 += z1 */ + subu t1, t1, s1 /* tmp2 += z2 */ + subu t2, t2, s1 /* tmp1 += z2 */ + subu t3, t3, t8 /* tmp3 += z1 */ + mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + addiu t9, zero, 6270 /* FIX_0_765366865 */ + mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ + lh t4, 0(a1) + lh t5, 0(a0) + lh t6, 64(a1) + lh t7, 64(a0) + mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */ + mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + addiu t9, zero, 4433 /* FIX_0_541196100 */ + addu s3, s0, s1 /* z2 + z3 */ + mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ + addiu t9, zero, 15137 /* FIX_1_847759065 */ + mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */ + addu t4, t5, t6 + subu t5, t5, t6 + sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */ + sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */ + addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t6, s3, t8 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ + addu s0, t4, t7 + subu s1, t4, t7 + addu s2, t5, t6 + subu s3, t5, t6 + addu t4, s0, t3 + subu s0, s0, t3 + addu t3, s2, t1 + subu s2, s2, t1 + addu t1, s3, t2 + subu s3, s3, t2 + addu t2, s1, t0 + subu s1, s1, t0 + shra_r.w t4, t4, 11 + shra_r.w t3, t3, 11 + shra_r.w t1, t1, 11 + shra_r.w t2, t2, 11 + shra_r.w s1, s1, 11 + shra_r.w s3, s3, 11 + shra_r.w s2, s2, 11 + shra_r.w s0, s0, 11 + sw t4, 0(v0) + sw t3, 32(v0) + sw t1, 64(v0) + sw t2, 96(v0) + sw s1, 128(v0) + sw s3, 160(v0) + sw s2, 192(v0) + sw s0, 224(v0) +3: + addiu a1, a1, 2 + addiu a0, a0, 2 + bgtz v1, 1b + addiu v0, v0, 4 + move v0, sp + addiu v1, zero, 8 +4: + lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */ + lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */ + lw t2, 0(v0) /* (JLONG)wsptr[0] */ + lw t3, 16(v0) /* (JLONG)wsptr[4] */ + lw s4, 4(v0) /* (JLONG)wsptr[1] */ + lw s5, 12(v0) /* (JLONG)wsptr[3] */ + lw s6, 20(v0) /* (JLONG)wsptr[5] */ + lw s7, 28(v0) /* (JLONG)wsptr[7] */ + or s4, s4, t0 + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, s7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 5f + addiu v1, v1, -1 + shra_r.w s5, t2, 5 + andi s5, s5, 0x3ff + lbux s5, s5(a3) + lw s1, 0(a2) + replv.qb s5, s5 + usw s5, 0(s1) + usw s5, 4(s1) + b 6f + nop +5: + addu t4, t0, t1 /* z2 + z3 */ + addiu t8, zero, 4433 /* FIX_0_541196100 */ + mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ + addiu t8, zero, 15137 /* FIX_1_847759065 */ + mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */ + addiu t8, zero, 6270 /* FIX_0_765366865 */ + mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */ + addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */ + subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */ + sll t4, t4, 13 /* tmp0 = + (wsptr[0] + wsptr[4]) << CONST_BITS */ + sll t2, t2, 13 /* tmp1 = + (wsptr[0] - wsptr[4]) << CONST_BITS */ + subu t1, t5, t1 /* tmp2 = + z1 + MULTIPLY(z3, -FIX_1_847759065) */ + subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */ + addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */ + addu t5, t5, t0 /* tmp3 = + z1 + MULTIPLY(z2, FIX_0_765366865) */ + subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */ + addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */ + lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */ + lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */ + lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */ + lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */ + addu s0, t4, t6 /* z3 = tmp0 + tmp2 */ + addiu t8, zero, 9633 /* FIX_1_175875602 */ + addu s1, t5, t7 /* z4 = tmp1 + tmp3 */ + addu s2, s0, s1 /* z3 + z4 */ + mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + addu s3, t4, t7 /* z1 = tmp0 + tmp3 */ + addu t9, t5, t6 /* z2 = tmp1 + tmp2 */ + addiu t8, zero, 16069 /* FIX_1_961570560 */ + mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ + addiu t8, zero, 3196 /* FIX_0_390180644 */ + mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ + addiu t8, zero, 2446 /* FIX_0_298631336 */ + mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + addiu t8, zero, 7373 /* FIX_0_899976223 */ + mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ + addiu t8, zero, 16819 /* FIX_2_053119869 */ + mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + addiu t8, zero, 20995 /* FIX_2_562915447 */ + mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ + addiu t8, zero, 25172 /* FIX_3_072711026 */ + mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + addiu t8, zero, 12299 /* FIX_1_501321110 */ + mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + subu s0, s2, s0 /* z3 += z5 */ + subu s1, s2, s1 /* z4 += z5 */ + addu t4, t4, s0 + subu t4, t4, s3 /* tmp0 */ + addu t5, t5, s1 + subu t5, t5, t9 /* tmp1 */ + addu t6, t6, s0 + subu t6, t6, t9 /* tmp2 */ + addu t7, t7, s1 + subu t7, t7, s3 /* tmp3 */ + addu s0, t0, t7 + subu t0, t0, t7 + addu t7, t2, t6 + subu t2, t2, t6 + addu t6, t3, t5 + subu t3, t3, t5 + addu t5, t1, t4 + subu t1, t1, t4 + shra_r.w s0, s0, 18 + shra_r.w t7, t7, 18 + shra_r.w t6, t6, 18 + shra_r.w t5, t5, 18 + shra_r.w t1, t1, 18 + shra_r.w t3, t3, 18 + shra_r.w t2, t2, 18 + shra_r.w t0, t0, 18 + andi s0, s0, 0x3ff + andi t7, t7, 0x3ff + andi t6, t6, 0x3ff + andi t5, t5, 0x3ff + andi t1, t1, 0x3ff + andi t3, t3, 0x3ff + andi t2, t2, 0x3ff + andi t0, t0, 0x3ff + lw s1, 0(a2) + lbux s0, s0(a3) + lbux t7, t7(a3) + lbux t6, t6(a3) + lbux t5, t5(a3) + lbux t1, t1(a3) + lbux t3, t3(a3) + lbux t2, t2(a3) + lbux t0, t0(a3) + sb s0, 0(s1) + sb t7, 1(s1) + sb t6, 2(s1) + sb t5, 3(s1) + sb t1, 4(s1) + sb t3, 5(s1) + sb t2, 6(s1) + sb t0, 7(s1) +6: + addiu v0, v0, 32 + bgtz v1, 4b + addiu a2, a2, 4 + addiu sp, sp, 256 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_islow_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) +/* + * a0 = inptr + * a1 = quantptr + * a2 = wsptr + * a3 = mips_idct_ifast_coefs + */ + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu t9, a0, 16 /* end address */ + or AT, a3, zero + +0: + lw s0, 0(a1) /* quantptr[DCTSIZE*0] */ + lw t0, 0(a0) /* inptr[DCTSIZE*0] */ + lw t1, 16(a0) /* inptr[DCTSIZE*1] */ + muleq_s.w.phl v0, t0, s0 /* tmp0 ... */ + lw t2, 32(a0) /* inptr[DCTSIZE*2] */ + lw t3, 48(a0) /* inptr[DCTSIZE*3] */ + lw t4, 64(a0) /* inptr[DCTSIZE*4] */ + lw t5, 80(a0) /* inptr[DCTSIZE*5] */ + muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */ + lw t6, 96(a0) /* inptr[DCTSIZE*6] */ + lw t7, 112(a0) /* inptr[DCTSIZE*7] */ + or s4, t1, t2 + or s5, t3, t4 + bnez s4, 1f + ins t0, v0, 16, 16 /* ... tmp0 */ + bnez s5, 1f + or s6, t5, t6 + or s6, s6, t7 + bnez s6, 1f + sw t0, 0(a2) /* wsptr[DCTSIZE*0] */ + sw t0, 16(a2) /* wsptr[DCTSIZE*1] */ + sw t0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw t0, 48(a2) /* wsptr[DCTSIZE*3] */ + sw t0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw t0, 80(a2) /* wsptr[DCTSIZE*5] */ + sw t0, 96(a2) /* wsptr[DCTSIZE*6] */ + sw t0, 112(a2) /* wsptr[DCTSIZE*7] */ + addiu a0, a0, 4 + b 2f + addiu a1, a1, 4 + +1: + lw s1, 32(a1) /* quantptr[DCTSIZE*2] */ + lw s2, 64(a1) /* quantptr[DCTSIZE*4] */ + muleq_s.w.phl v0, t2, s1 /* tmp1 ... */ + muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */ + lw s0, 16(a1) /* quantptr[DCTSIZE*1] */ + lw s1, 48(a1) /* quantptr[DCTSIZE*3] */ + lw s3, 96(a1) /* quantptr[DCTSIZE*6] */ + muleq_s.w.phl v1, t4, s2 /* tmp2 ... */ + muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */ + lw s2, 80(a1) /* quantptr[DCTSIZE*5] */ + lw t8, 4(AT) /* FIX(1.414213562) */ + ins t2, v0, 16, 16 /* ... tmp1 */ + muleq_s.w.phl v0, t6, s3 /* tmp3 ... */ + muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */ + ins t4, v1, 16, 16 /* ... tmp2 */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + ins t6, v0, 16, 16 /* ... tmp3 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + muleq_s.w.phl v0, t1, s0 /* tmp4 ... */ + muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */ + shll_s.ph s6, s6, 1 /* x2 */ + lw s3, 112(a1) /* quantptr[DCTSIZE*7] */ + subq.ph s6, s6, s7 /* ... tmp12 */ + muleq_s.w.phl v1, t7, s3 /* tmp7 ... */ + muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */ + ins t1, v0, 16, 16 /* ... tmp4 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + muleq_s.w.phl v0, t5, s2 /* tmp6 ... */ + muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */ + ins t7, v1, 16, 16 /* ... tmp7 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + muleq_s.w.phl v1, t3, s1 /* tmp5 ... */ + muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */ + ins t5, v0, 16, 16 /* ... tmp6 */ + ins t3, v1, 16, 16 /* ... tmp5 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ + addq.ph s0, t0, t7 + subq.ph s1, t0, t7 + mulq_s.ph v1, v1, t8 /* ... z5 */ + shll_s.ph s5, s5, 1 /* x2 */ + lw t8, 12(AT) /* FIX(-2.613125930) */ + sw s0, 0(a2) /* wsptr[DCTSIZE*0] */ + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ + addiu a0, a0, 4 + addiu a1, a1, 4 + sw s1, 112(a2) /* wsptr[DCTSIZE*7] */ + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ + addq.ph s2, t2, t5 + addq.ph t1, s4, t3 /* tmp4 */ + subq.ph s3, t2, t5 + sw s2, 16(a2) /* wsptr[DCTSIZE*1] */ + sw s3, 96(a2) /* wsptr[DCTSIZE*6] */ + addq.ph v0, t4, t3 + subq.ph v1, t4, t3 + sw v0, 32(a2) /* wsptr[DCTSIZE*2] */ + sw v1, 80(a2) /* wsptr[DCTSIZE*5] */ + addq.ph v0, t6, t1 + subq.ph v1, t6, t1 + sw v0, 64(a2) /* wsptr[DCTSIZE*4] */ + sw v1, 48(a2) /* wsptr[DCTSIZE*3] */ + +2: + bne a0, t9, 0b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_ifast_cols_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) +/* + * a0 = wsptr + * a1 = output_buf + * a2 = output_col + * a3 = mips_idct_ifast_coefs + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + addiu t9, a0, 128 /* end address */ + lui s8, 0x8080 + ori s8, s8, 0x8080 + +0: + lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */ + lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */ + lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */ + lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */ + lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */ + lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */ + lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */ + lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */ + lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */ + precrq.ph.w t1, s0, t0 /* B b */ + ins t0, s0, 16, 16 /* A a */ + bnez t1, 1f + or s0, t2, s2 + bnez s0, 1f + or s0, t4, s4 + bnez s0, 1f + or s0, t6, s6 + bnez s0, 1f + shll_s.ph s0, t0, 2 /* A a */ + lw a3, 0(a1) + lw AT, 4(a1) + precrq.ph.w t0, s0, s0 /* A A */ + ins s0, s0, 16, 16 /* a a */ + addu a3, a3, a2 + addu AT, AT, a2 + precrq.qb.ph t0, t0, t0 /* A A A A */ + precrq.qb.ph s0, s0, s0 /* a a a a */ + addu.qb s0, s0, s8 + addu.qb t0, t0, s8 + sw s0, 0(a3) + sw s0, 4(a3) + sw t0, 0(AT) + sw t0, 4(AT) + addiu a0, a0, 32 + bne a0, t9, 0b + addiu a1, a1, 8 + b 2f + nop + +1: + precrq.ph.w t3, s2, t2 + ins t2, s2, 16, 16 + precrq.ph.w t5, s4, t4 + ins t4, s4, 16, 16 + precrq.ph.w t7, s6, t6 + ins t6, s6, 16, 16 + lw t8, 4(AT) /* FIX(1.414213562) */ + addq.ph s4, t0, t4 /* tmp10 */ + subq.ph s5, t0, t4 /* tmp11 */ + subq.ph s6, t2, t6 /* tmp12 ... */ + addq.ph s7, t2, t6 /* tmp13 */ + mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ + addq.ph t0, s4, s7 /* tmp0 */ + subq.ph t6, s4, s7 /* tmp3 */ + shll_s.ph s6, s6, 1 /* x2 */ + subq.ph s6, s6, s7 /* ... tmp12 */ + addq.ph t2, s5, s6 /* tmp1 */ + subq.ph t4, s5, s6 /* tmp2 */ + addq.ph s5, t1, t7 /* z11 */ + subq.ph s6, t1, t7 /* z12 */ + addq.ph s7, t5, t3 /* z13 */ + subq.ph v0, t5, t3 /* z10 */ + addq.ph t7, s5, s7 /* tmp7 */ + subq.ph s5, s5, s7 /* tmp11 ... */ + addq.ph v1, v0, s6 /* z5 ... */ + mulq_s.ph s5, s5, t8 /* ... tmp11 */ + lw t8, 8(AT) /* FIX(1.847759065) */ + lw s4, 0(AT) /* FIX(1.082392200) */ + addq.ph s0, t0, t7 /* tmp0 + tmp7 */ + subq.ph s7, t0, t7 /* tmp0 - tmp7 */ + mulq_s.ph v1, v1, t8 /* ... z5 */ + lw a3, 0(a1) + lw t8, 12(AT) /* FIX(-2.613125930) */ + shll_s.ph s5, s5, 1 /* x2 */ + addu a3, a3, a2 + shll_s.ph v0, v0, 1 /* x4 */ + mulq_s.ph v0, v0, t8 /* tmp12 ... */ + mulq_s.ph s4, s6, s4 /* tmp10 ... */ + shll_s.ph v1, v1, 1 /* x2 */ + addiu a0, a0, 32 + addiu a1, a1, 8 + shll_s.ph s6, v0, 1 /* x4 */ + shll_s.ph s4, s4, 1 /* x2 */ + addq.ph s6, s6, v1 /* ... tmp12 */ + shll_s.ph s0, s0, 2 + subq.ph t5, s6, t7 /* tmp6 */ + subq.ph s4, s4, v1 /* ... tmp10 */ + subq.ph t3, s5, t5 /* tmp5 */ + shll_s.ph s7, s7, 2 + addq.ph t1, s4, t3 /* tmp4 */ + addq.ph s1, t2, t5 /* tmp1 + tmp6 */ + subq.ph s6, t2, t5 /* tmp1 - tmp6 */ + addq.ph s2, t4, t3 /* tmp2 + tmp5 */ + subq.ph s5, t4, t3 /* tmp2 - tmp5 */ + addq.ph s4, t6, t1 /* tmp3 + tmp4 */ + subq.ph s3, t6, t1 /* tmp3 - tmp4 */ + shll_s.ph s1, s1, 2 + shll_s.ph s2, s2, 2 + shll_s.ph s3, s3, 2 + shll_s.ph s4, s4, 2 + shll_s.ph s5, s5, 2 + shll_s.ph s6, s6, 2 + precrq.ph.w t0, s1, s0 /* B A */ + ins s0, s1, 16, 16 /* b a */ + precrq.ph.w t2, s3, s2 /* D C */ + ins s2, s3, 16, 16 /* d c */ + precrq.ph.w t4, s5, s4 /* F E */ + ins s4, s5, 16, 16 /* f e */ + precrq.ph.w t6, s7, s6 /* H G */ + ins s6, s7, 16, 16 /* h g */ + precrq.qb.ph t0, t2, t0 /* D C B A */ + precrq.qb.ph s0, s2, s0 /* d c b a */ + precrq.qb.ph t4, t6, t4 /* H G F E */ + precrq.qb.ph s4, s6, s4 /* h g f e */ + addu.qb s0, s0, s8 + addu.qb s4, s4, s8 + sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */ + sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */ + lw a3, -4(a1) + addu.qb t0, t0, s8 + addu a3, a3, a2 + addu.qb t4, t4, s8 + sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */ + bne a0, t9, 0b + sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */ + +2: + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + j ra + nop + +END(jsimd_idct_ifast_rows_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_fdct_islow_dspr2) +/* + * a0 = data + */ + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lui t0, 6437 + ori t0, 2260 + lui t1, 9633 + ori t1, 11363 + lui t2, 0xd39e + ori t2, 0xe6dc + lui t3, 0xf72d + ori t3, 9633 + lui t4, 2261 + ori t4, 9633 + lui t5, 0xd39e + ori t5, 6437 + lui t6, 9633 + ori t6, 0xd39d + lui t7, 0xe6dc + ori t7, 2260 + lui t8, 4433 + ori t8, 10703 + lui t9, 0xd630 + ori t9, 4433 + li s8, 8 + move a1, a0 +1: + lw s0, 0(a1) /* tmp0 = 1|0 */ + lw s1, 4(a1) /* tmp1 = 3|2 */ + lw s2, 8(a1) /* tmp2 = 5|4 */ + lw s3, 12(a1) /* tmp3 = 7|6 */ + packrl.ph s1, s1, s1 /* tmp1 = 2|3 */ + packrl.ph s3, s3, s3 /* tmp3 = 6|7 */ + subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */ + dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */ + dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */ + dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */ + dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */ + addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */ + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */ + extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */ + addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ + sh s0, 2(a1) + sh s1, 6(a1) + sh s2, 10(a1) + sh s3, 14(a1) + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */ + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */ + sra s4, s5, 16 /* tmp4 = t11 */ + addiu a1, a1, 16 + addiu s8, s8, -1 + extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ + extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ + addu s2, s5, s4 /* tmp2 = t10 + t11 */ + subu s3, s5, s4 /* tmp3 = t10 - t11 */ + sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */ + sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */ + sh s2, -16(a1) + sh s3, -8(a1) + sh s0, -12(a1) + bgtz s8, 1b + sh s1, -4(a1) + li t0, 2260 + li t1, 11363 + li t2, 9633 + li t3, 6436 + li t4, 6437 + li t5, 2261 + li t6, 11362 + li t7, 2259 + li t8, 4433 + li t9, 10703 + li a1, 10704 + li s8, 8 + +2: + lh a2, 0(a0) /* 0 */ + lh a3, 16(a0) /* 8 */ + lh v0, 32(a0) /* 16 */ + lh v1, 48(a0) /* 24 */ + lh s4, 64(a0) /* 32 */ + lh s5, 80(a0) /* 40 */ + lh s6, 96(a0) /* 48 */ + lh s7, 112(a0) /* 56 */ + addu s2, v0, s5 /* tmp2 = 16 + 40 */ + subu s5, v0, s5 /* tmp5 = 16 - 40 */ + addu s3, v1, s4 /* tmp3 = 24 + 32 */ + subu s4, v1, s4 /* tmp4 = 24 - 32 */ + addu s0, a2, s7 /* tmp0 = 0 + 56 */ + subu s7, a2, s7 /* tmp7 = 0 - 56 */ + addu s1, a3, s6 /* tmp1 = 8 + 48 */ + subu s6, a3, s6 /* tmp6 = 8 - 48 */ + addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */ + subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */ + addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */ + subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */ + mult s7, t1 /* ac0 = tmp7 * c1 */ + madd s4, t0 /* ac0 += tmp4 * c0 */ + madd s5, t4 /* ac0 += tmp5 * c4 */ + madd s6, t2 /* ac0 += tmp6 * c2 */ + mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */ + msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */ + msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */ + msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */ + mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */ + madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */ + madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */ + msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */ + mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */ + msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */ + madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */ + msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */ + extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */ + extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */ + extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */ + extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */ + addiu s8, s8, -1 + addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */ + subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */ + sh s0, 16(a0) + sh s1, 48(a0) + sh s2, 80(a0) + sh s3, 112(a0) + mult v0, t8 /* ac0 = tmp12 * c8 */ + madd v1, t9 /* ac0 += tmp13 * c9 */ + mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */ + msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */ + addiu a0, a0, 2 + extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */ + extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */ + shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */ + shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */ + sh s4, -2(a0) + sh s5, 62(a0) + sh s6, 30(a0) + bgtz s8, 2b + sh s7, 94(a0) + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + jr ra + nop + +END(jsimd_fdct_islow_dspr2) + + +/**************************************************************************/ +LEAF_DSPR2(jsimd_fdct_ifast_dspr2) +/* + * a0 = data + */ + .set at + + SAVE_REGS_ON_STACK 8, s0, s1 + + li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) | + (334 & 0xffff) */ + li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) | + (139 & 0xffff) */ + li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) | + (98 & 0xffff) */ + li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) | + (181 & 0xffff) */ + + move v0, a0 + addiu v1, v0, 128 /* end address */ + +0: + lw t0, 0(v0) /* tmp0 = 1|0 */ + lw t1, 4(v0) /* tmp1 = 3|2 */ + lw t2, 8(v0) /* tmp2 = 5|4 */ + lw t3, 12(v0) /* tmp3 = 7|6 */ + packrl.ph t1, t1, t1 /* tmp1 = 2|3 */ + packrl.ph t3, t3, t3 /* tmp3 = 6|7 */ + subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */ + subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */ + addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */ + addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */ + addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ + subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ + sra t4, t8, 16 /* tmp4 = t11 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t9, s1 + mult $ac1, $0, $0 /* ac1 = 0 */ + dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */ + dpsx.w.ph $ac1, t5, a3 /* ac1 += t6*98 + t7*98 */ + mult $ac2, $0, $0 /* ac2 = 0 */ + dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */ + mult $ac3, $0, $0 /* ac3 = 0 */ + dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */ + precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */ + addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */ + subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */ + extr.w t4, $ac0, 8 + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */ + extr.w t0, $ac1, 8 /* t0 = z5 */ + extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */ + extr.w t7, $ac3, 8 /* t2 = MULTIPLY(tmp12, 334) */ + extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */ + add t6, t1, t0 /* t6 = z2 */ + add t7, t7, t0 /* t7 = z4 */ + subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */ + addq.ph t8, t5, t8 /* t9 = z11 = tmp7 + z3 */ + addq.ph t1, t0, t6 /* t1 = z13 + z2 */ + subq.ph t6, t0, t6 /* t6 = z13 - z2 */ + addq.ph t0, t8, t7 /* t0 = z11 + z4 */ + subq.ph t7, t8, t7 /* t7 = z11 - z4 */ + addq.ph t5, t4, t9 + subq.ph t4, t9, t4 + sh t2, 0(v0) + sh t5, 4(v0) + sh t3, 8(v0) + sh t4, 12(v0) + sh t1, 10(v0) + sh t6, 6(v0) + sh t0, 2(v0) + sh t7, 14(v0) + addiu v0, 16 + bne v1, v0, 0b + nop + move v0, a0 + addiu v1, v0, 16 + +1: + lh t0, 0(v0) /* 0 */ + lh t1, 16(v0) /* 8 */ + lh t2, 32(v0) /* 16 */ + lh t3, 48(v0) /* 24 */ + lh t4, 64(v0) /* 32 */ + lh t5, 80(v0) /* 40 */ + lh t6, 96(v0) /* 48 */ + lh t7, 112(v0) /* 56 */ + add t8, t0, t7 /* t8 = tmp0 */ + sub t7, t0, t7 /* t7 = tmp7 */ + add t0, t1, t6 /* t0 = tmp1 */ + sub t1, t1, t6 /* t1 = tmp6 */ + add t6, t2, t5 /* t6 = tmp2 */ + sub t5, t2, t5 /* t5 = tmp5 */ + add t2, t3, t4 /* t2 = tmp3 */ + sub t3, t3, t4 /* t3 = tmp4 */ + add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */ + sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */ + sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */ + ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */ + add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */ + mult $0, $0 /* ac0 = 0 */ + dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */ + add s0, t4, t2 /* t8 = tmp10+tmp11 */ + sub t4, t4, t2 /* t4 = tmp10-tmp11 */ + sh s0, 0(v0) + sh t4, 64(v0) + extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13, + FIX_0_707106781) */ + addq.ph t4, t8, t2 /* t9 = tmp13 + z1 */ + subq.ph t8, t8, t2 /* t2 = tmp13 - z1 */ + sh t4, 32(v0) + sh t8, 96(v0) + add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */ + add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */ + add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */ + andi t4, a1, 0xffff + mul s0, t1, t4 + sra s0, s0, 8 /* s0 = z4 = + MULTIPLY(tmp12, FIX_1_306562965) */ + ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */ + mult $0, $0 /* ac0 = 0 */ + mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */ + extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12, + FIX_0_382683433) */ + add t2, t7, t8 /* t2 = tmp7 + z5 */ + sub t7, t7, t8 /* t7 = tmp7 - z5 */ + andi t4, a2, 0xffff + mul t8, t3, t4 + sra t8, t8, 8 /* t8 = z2 = + MULTIPLY(tmp10, FIX_0_541196100) */ + andi t4, s1, 0xffff + mul t6, t0, t4 + sra t6, t6, 8 /* t6 = z3 = + MULTIPLY(tmp11, FIX_0_707106781) */ + add t0, t6, t8 /* t0 = z3 + z2 */ + sub t1, t6, t8 /* t1 = z3 - z2 */ + add t3, t6, s0 /* t3 = z3 + z4 */ + sub t4, t6, s0 /* t4 = z3 - z4 */ + sub t5, t2, t1 /* t5 = dataptr[5] */ + sub t6, t7, t0 /* t6 = dataptr[3] */ + add t3, t2, t3 /* t3 = dataptr[1] */ + add t4, t7, t4 /* t4 = dataptr[7] */ + sh t5, 80(v0) + sh t6, 48(v0) + sh t3, 16(v0) + sh t4, 112(v0) + addiu v0, 2 + bne v0, v1, 1b + nop + + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop +END(jsimd_fdct_ifast_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_quantize_dspr2) +/* + * a0 = coef_block + * a1 = divisors + * a2 = workspace + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2 + + addiu v0, a2, 124 /* v0 = workspace_end */ + lh t0, 0(a2) + lh t1, 0(a1) + lh t2, 128(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t4, 384(a1) + lh t5, 130(a1) + lh t6, 2(a2) + lh t7, 2(a1) + lh t8, 386(a1) + +1: + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + addiu a2, a2, 4 + addiu a1, a1, 4 + add s1, t6, t5 + andi s1, 0xffff + sh v1, 0(a0) + + mul s2, s1, t7 + addiu s1, t8, 16 + srav s2, s2, s1 + mul s2, s2, s0 + lh t0, 0(a2) + lh t1, 0(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t2, 128(a1) + lh t4, 384(a1) + lh t5, 130(a1) + lh t8, 386(a1) + lh t6, 2(a2) + lh t7, 2(a1) + sh s2, 2(a0) + lh t0, 0(a2) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + bne a2, v0, 1b + addiu a0, a0, 4 + + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + sh v1, 0(a0) + add s1, t6, t5 + andi s1, 0xffff + mul s2, s1, t7 + addiu s1, t8, 16 + addiu a2, a2, 4 + addiu a1, a1, 4 + srav s2, s2, s1 + mul s2, s2, s0 + sh s2, 2(a0) + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2 + + j ra + nop + +END(jsimd_quantize_dspr2) + + +#ifndef __mips_soft_float + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_quantize_float_dspr2) +/* + * a0 = coef_block + * a1 = divisors + * a2 = workspace + */ + .set at + + li t1, 0x46800100 /* integer representation 16384.5 */ + mtc1 t1, f0 + li t0, 63 +0: + lwc1 f2, 0(a2) + lwc1 f10, 0(a1) + lwc1 f4, 4(a2) + lwc1 f12, 4(a1) + lwc1 f6, 8(a2) + lwc1 f14, 8(a1) + lwc1 f8, 12(a2) + lwc1 f16, 12(a1) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + lwc1 f10, 16(a1) + lwc1 f12, 20(a1) + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + lwc1 f14, 24(a1) + lwc1 f16, 28(a1) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + lwc1 f2, 16(a2) + lwc1 f4, 20(a2) + lwc1 f6, 24(a2) + lwc1 f8, 28(a2) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + addiu t0, t0, -8 + addiu a2, a2, 32 + addiu a1, a1, 32 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + sh t1, 8(a0) + sh t2, 10(a0) + sh t3, 12(a0) + sh t4, 14(a0) + bgez t0, 0b + addiu a0, a0, 16 + + j ra + nop + +END(jsimd_quantize_float_dspr2) + +#endif + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_2x2_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + addiu sp, sp, -40 + move v0, sp + addiu s2, zero, 29692 + addiu s3, zero, -10426 + addiu s4, zero, 6967 + addiu s5, zero, -5906 + lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */ + lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */ + lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */ + lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */ + mul t4, t5, t0 + lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */ + lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */ + mul t6, t6, t1 + mul t5, t5, t0 + lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */ + lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */ + lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */ + lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */ + mul t7, t7, t2 + mult zero, zero + mul t8, t8, t3 + li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */ + li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */ + ins t6, t5, 16, 16 /* t6 = t5|t6 */ + sll t4, t4, 15 + dpa.w.ph $ac0, t6, s0 + lh t1, 2(a1) + lh t6, 2(a0) + ins t8, t7, 16, 16 /* t8 = t7|t8 */ + dpa.w.ph $ac0, t8, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 18(a1) + lh t6, 18(a0) + lh t2, 50(a1) + lh t7, 50(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 82(a1) + lh t2, 82(a0) + lh t3, 114(a1) + lh t4, 114(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 0(v0) + sw t8, 20(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 6(a1) + lh t6, 6(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 22(a1) + lh t6, 22(a0) + lh t2, 54(a1) + lh t7, 54(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 86(a1) + lh t2, 86(a0) + lh t3, 118(a1) + lh t4, 118(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 4(v0) + sw t8, 24(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 10(a1) + lh t6, 10(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 26(a1) + lh t6, 26(a0) + lh t2, 58(a1) + lh t7, 58(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 90(a1) + lh t2, 90(a0) + lh t3, 122(a1) + lh t4, 122(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 8(v0) + sw t8, 28(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 14(a1) + lh t6, 14(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 30(a1) + lh t6, 30(a0) + lh t2, 62(a1) + lh t7, 62(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 94(a1) + lh t2, 94(a0) + lh t3, 126(a1) + lh t4, 126(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 12(v0) + sw t8, 32(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + lw t9, 0(a2) + lw t3, 0(v0) + lw t7, 4(v0) + lw t1, 8(v0) + addu t9, t9, a3 + sll t3, t3, 15 + subu t8, t4, t0 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + shra_r.w t8, t8, 13 + sw t0, 16(v0) + sw t8, 36(v0) + lw t5, 12(v0) + lw t6, 16(v0) + mult t7, s2 + madd t1, s3 + madd t5, s4 + madd t6, s5 + lw t5, 24(v0) + lw t7, 28(v0) + mflo t0, $ac0 + lw t8, 32(v0) + lw t2, 36(v0) + mult $ac1, t5, s2 + madd $ac1, t7, s3 + madd $ac1, t8, s4 + madd $ac1, t2, s5 + addu t1, t3, t0 + subu t6, t3, t0 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + mflo t4, $ac1 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + lw t0, 20(v0) + sb t1, 0(t9) + sb t6, 1(t9) + sll t0, t0, 15 + lw t9, 4(a2) + addu t1, t0, t4 + subu t6, t0, t4 + addu t9, t9, a3 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + sb t1, 0(t9) + sb t6, 1(t9) + addiu sp, sp, 40 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop + +END(jsimd_idct_2x2_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_4x4_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes) + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw v1, 48(sp) + move t0, a1 + move t1, v1 + li t9, 4 + li s0, 0x2e75f93e + li s1, 0x21f9ba79 + li s2, 0xecc2efb0 + li s3, 0x52031ccd + +0: + lh s6, 32(t0) /* inptr[DCTSIZE*2] */ + lh t6, 32(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 96(t0) /* inptr[DCTSIZE*6] */ + lh t7, 96(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 0(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s5, 0(a0) /* quantptr[0] */ + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 112(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 112(a0) /* quantptr[DCTSIZE*7] */ + lh v0, 80(t0) /* inptr[DCTSIZE*5] */ + lh s5, 80(a0) /* quantptr[DCTSIZE*5] */ + lh s6, 48(a0) /* quantptr[DCTSIZE*3] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s7, 16(a0) /* quantptr[DCTSIZE*1] */ + lh t8, 16(t0) /* inptr[DCTSIZE*1] */ + subu t6, t6, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 48(t0) /* inptr[DCTSIZE*3] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, t6 /* tmp10 = tmp0 + z2 */ + subu t4, t2, t6 /* tmp10 = tmp0 - z2 */ + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, v0, 16, 16 + ins t7, t8, 16, 16 + addiu t9, t9, -1 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo s4, $ac0 + mflo s5, $ac1 + addiu a0, a0, 2 + addiu t1, t1, 4 + addiu t0, t0, 2 + addu t6, t4, s4 + subu t5, t4, s4 + addu s6, t3, s5 + subu s7, t3, s5 + shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ + sw t6, 28(t1) + sw t5, 60(t1) + sw s6, -4(t1) + bgtz t9, 0b + sw s7, 92(t1) + /* second loop three pass */ + li t9, 3 +1: + lh s6, 34(t0) /* inptr[DCTSIZE*2] */ + lh t6, 34(a0) /* quantptr[DCTSIZE*2] */ + lh s7, 98(t0) /* inptr[DCTSIZE*6] */ + lh t7, 98(a0) /* quantptr[DCTSIZE*6] */ + mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh s4, 2(t0) /* inptr[DCTSIZE*0] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s5, 2(a0) /* quantptr[DCTSIZE*0] */ + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ + mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] * + quantptr[DCTSIZE*2]) */ + lh t5, 114(t0) /* inptr[DCTSIZE*7] */ + mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * + quantptr[DCTSIZE*6]) */ + lh s4, 114(a0) /* quantptr[DCTSIZE*7] */ + lh s5, 82(a0) /* quantptr[DCTSIZE*5] */ + lh t6, 82(t0) /* inptr[DCTSIZE*5] */ + sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ + lh s6, 50(a0) /* quantptr[DCTSIZE*3] */ + lh t8, 18(t0) /* inptr[DCTSIZE*1] */ + subu v0, v0, t7 /* tmp2 = + MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ + lh t7, 50(t0) /* inptr[DCTSIZE*3] */ + lh s7, 18(a0) /* quantptr[DCTSIZE*1] */ + mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * + quantptr[DCTSIZE*7]) */ + mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] * + quantptr[DCTSIZE*5]) */ + mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * + quantptr[DCTSIZE*3]) */ + mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * + quantptr[DCTSIZE*1]) */ + addu t3, t2, v0 /* tmp10 = tmp0 + z2 */ + subu t4, t2, v0 /* tmp10 = tmp0 - z2 */ + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo t5, $ac0 + mflo t6, $ac1 + addiu t9, t9, -1 + addiu t0, t0, 2 + addiu a0, a0, 2 + addiu t1, t1, 4 + addu s5, t4, t5 + subu s4, t4, t5 + addu s6, t3, t6 + subu s7, t3, t6 + shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */ + shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */ + shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ + shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ + sw s5, 32(t1) + sw s4, 64(t1) + sw s6, 0(t1) + bgtz t9, 1b + sw s7, 96(t1) + move t1, v1 + li s4, 15137 + lw s6, 8(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 24(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 0(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 28(t1) /* wsptr[7] */ + lh t6, 20(t1) /* wsptr[5] */ + lh t7, 12(t1) /* wsptr[3] */ + lh t8, 4(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 0(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + /* 2 */ + li s4, 15137 + lw s6, 40(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 56(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 32(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 60(t1) /* wsptr[7] */ + lh t6, 52(t1) /* wsptr[5] */ + lh t7, 44(t1) /* wsptr[3] */ + lh t8, 36(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, + CONST_BITS-PASS1_BITS+1) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, + CONST_BITS-PASS1_BITS+1) */ + sll s4, t9, 2 + lw v0, 4(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + /* 3 */ + li s4, 15137 + lw s6, 72(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 88(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 64(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 92(t1) /* wsptr[7] */ + lh t6, 84(t1) /* wsptr[5] */ + lh t7, 76(t1) /* wsptr[3] */ + lh t8, 68(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 8(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + li s4, 15137 + lw s6, 104(t1) /* wsptr[2] */ + li s5, 6270 + lw s7, 120(t1) /* wsptr[6] */ + mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], + FIX_1_847759065) */ + lw t2, 96(t1) /* wsptr[0] */ + mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], + -FIX_0_765366865) */ + lh t5, 124(t1) /* wsptr[7] */ + lh t6, 116(t1) /* wsptr[5] */ + lh t7, 108(t1) /* wsptr[3] */ + lh t8, 100(t1) /* wsptr[1] */ + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 /* tmp0 = + ((JLONG)wsptr[0]) << (CONST_BITS+1) */ + mflo s6, $ac0 + /* MULTIPLY(wsptr[2], FIX_1_847759065) + + MULTIPLY(wsptr[6], -FIX_0_765366865) */ + subu s4, s4, s5 + addu t3, t2, s4 /* tmp10 = tmp0 + z2; */ + mflo s7, $ac1 + subu t4, t2, s4 /* tmp10 = tmp0 - z2; */ + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ + shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ + shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ + shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ + sll s4, t9, 2 + lw v0, 12(a2) /* output_buf[ctr] */ + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_idct_4x4_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_6x6_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = output_buf + * a3 = output_col + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -144 + move v0, sp + addiu v1, v0, 24 + addiu t9, zero, 5793 + addiu s0, zero, 10033 + addiu s1, zero, 2998 + +1: + lh s2, 0(a0) /* q0 = quantptr[ 0] */ + lh s3, 32(a0) /* q1 = quantptr[16] */ + lh s4, 64(a0) /* q2 = quantptr[32] */ + lh t2, 64(a1) /* tmp2 = inptr[32] */ + lh t1, 32(a1) /* tmp1 = inptr[16] */ + lh t0, 0(a1) /* tmp0 = inptr[ 0] */ + mul t2, t2, s4 /* tmp2 = tmp2 * q2 */ + mul t1, t1, s3 /* tmp1 = tmp1 * q1 */ + mul t0, t0, s2 /* tmp0 = tmp0 * q0 */ + lh t6, 16(a1) /* z1 = inptr[ 8] */ + lh t8, 80(a1) /* z3 = inptr[40] */ + lh t7, 48(a1) /* z2 = inptr[24] */ + lh s2, 16(a0) /* q0 = quantptr[ 8] */ + lh s4, 80(a0) /* q2 = quantptr[40] */ + lh s3, 48(a0) /* q1 = quantptr[24] */ + mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */ + mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + mul t6, t6, s2 /* z1 = z1 * q0 */ + mul t8, t8, s4 /* z3 = z3 * q2 */ + mul t7, t7, s3 /* z2 = z2 * q1 */ + addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */ + sll t2, t2, 1 /* tmp2 = tmp2 << 2 */ + subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */ + subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */ + addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */ + addu t1, t6, t8 /* tmp1 = z1 + z3 */ + mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + subu t2, t6, t8 /* tmp2 = z1 - z3 */ + subu t2, t2, t7 /* tmp2 = tmp2 - z2 */ + sll t2, t2, 2 /* tmp2 = tmp2 << 2 */ + addu t0, t6, t7 /* tmp0 = z1 + z2 */ + sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ + subu s2, t8, t7 /* q0 = z3 - z2 */ + sll s2, s2, 13 /* q0 = q0 << 13 */ + addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */ + addu t1, s2, t1 /* tmp1 = q0 + tmp1 */ + addu s2, t4, t2 /* q0 = tmp11 + tmp2 */ + subu s3, t4, t2 /* q1 = tmp11 - tmp2 */ + addu t6, t3, t0 /* z1 = tmp10 + tmp0 */ + subu t7, t3, t0 /* z2 = tmp10 - tmp0 */ + addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */ + subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */ + shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */ + shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */ + shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ + shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */ + sw s2, 24(v0) + sw s3, 96(v0) + sw t6, 0(v0) + sw t7, 120(v0) + sw t4, 48(v0) + sw t5, 72(v0) + addiu v0, v0, 4 + addiu a1, a1, 2 + bne v0, v1, 1b + addiu a0, a0, 2 + + /* Pass 2: process 6 rows from work array, store into output array. */ + move v0, sp + addiu v1, v0, 144 + +2: + lw t0, 0(v0) + lw t2, 16(v0) + lw s5, 0(a2) + addiu t0, t0, 16 + sll t0, t0, 13 + mul t3, t2, t9 + lw t6, 4(v0) + lw t8, 20(v0) + lw t7, 12(v0) + addu s5, s5, a3 + addu s6, t6, t8 + mul s6, s6, s1 + addu t1, t0, t3 + subu t4, t0, t3 + subu t4, t4, t3 + lw t3, 8(v0) + mul t0, t3, s0 + addu s7, t6, t7 + sll s7, s7, 13 + addu s7, s6, s7 + subu t2, t8, t7 + sll t2, t2, 13 + addu t2, s6, t2 + subu s6, t6, t7 + subu s6, s6, t8 + sll s6, s6, 13 + addu t3, t1, t0 + subu t5, t1, t0 + addu t6, t3, s7 + subu t3, t3, s7 + addu t7, t4, s6 + subu t4, t4, s6 + addu t8, t5, t2 + subu t5, t5, t2 + shll_s.w t6, t6, 6 + shll_s.w t3, t3, 6 + shll_s.w t7, t7, 6 + shll_s.w t4, t4, 6 + shll_s.w t8, t8, 6 + shll_s.w t5, t5, 6 + sra t6, t6, 24 + addiu t6, t6, 128 + sra t3, t3, 24 + addiu t3, t3, 128 + sb t6, 0(s5) + sra t7, t7, 24 + addiu t7, t7, 128 + sb t3, 5(s5) + sra t4, t4, 24 + addiu t4, t4, 128 + sb t7, 1(s5) + sra t8, t8, 24 + addiu t8, t8, 128 + sb t4, 4(s5) + addiu v0, v0, 24 + sra t5, t5, 24 + addiu t5, t5, 128 + sb t8, 2(s5) + addiu a2, a2, 4 + bne v0, v1, 2b + sb t5, 3(s5) + + addiu sp, sp, 144 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_6x6_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) +/* + * a0 = compptr->dct_table + * a1 = coef_block + * a2 = workspace + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 8 + +1: + /* odd part */ + lh t0, 48(a1) + lh t1, 48(a0) + lh t2, 16(a1) + lh t3, 16(a0) + lh t4, 80(a1) + lh t5, 80(a0) + lh t6, 112(a1) + lh t7, 112(a0) + mul t0, t0, t1 /* z2 */ + mul t1, t2, t3 /* z1 */ + mul t2, t4, t5 /* z3 */ + mul t3, t6, t7 /* z4 */ + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + li t6, 7053 /* FIX(0.860918669) */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t7, t1, t2 /* tmp10 */ + addu t8, t7, t3 /* tmp10 + z4 */ + mul t6, t6, t8 /* tmp15 */ + li t8, 2139 /* FIX(0.261052384) */ + mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + li t7, 2295 /* FIX(0.280143716) */ + mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1-=z4 */ + subu t0, t0, t2 /* z2-=z3 */ + addu t2, t0, t1 /* z1+z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_0_765366865 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t8, t6, t8 /* tmp12 */ + addu t3, t8, t4 /* tmp12 + tmp11 */ + addu t3, t3, t7 /* tmp10 */ + subu t8, t8, t9 /* tmp12 + tmp13 */ + addu s0, t5, s0 + subu t8, t8, s0 /* tmp12 */ + subu t9, t6, t9 + subu s1, s1, t4 + addu t9, t9, s1 /* tmp13 */ + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 /* tmp15 */ + /* even part start */ + lh t4, 64(a1) + lh t5, 64(a0) + lh t7, 32(a1) + lh s0, 32(a0) + lh s1, 0(a1) + lh s2, 0(a0) + lh s3, 96(a1) + lh v0, 96(a0) + mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4], + quantptr[DCTSIZE*4]) */ + mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2], + quantptr[DCTSIZE*2]) */ + mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0]) */ + mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6], + quantptr[DCTSIZE*6]) */ + /* odd part end */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* update counter and pointers */ + addiu a3, a3, -1 + addiu a0, a0, 2 + addiu a1, a1, 2 + /* even part rest */ + li s1, 10033 + li s2, 11190 + mul t4, t4, s1 /* z4 */ + mul s1, t5, s2 /* z4 */ + sll t5, t5, 13 /* z1 */ + sll t7, t7, 13 + addiu t7, t7, 1024 /* z3 */ + sll s0, s0, 13 /* z2 */ + addu s2, t7, t4 /* tmp10 */ + subu t4, t7, t4 /* tmp11 */ + subu s3, t5, s0 /* tmp12 */ + addu t2, t7, s3 /* tmp21 */ + subu s3, t7, s3 /* tmp24 */ + addu t7, s1, s0 /* tmp12 */ + addu v0, s2, t7 /* tmp20 */ + subu s2, s2, t7 /* tmp25 */ + subu s1, s1, t5 /* z4 - z1 */ + subu s1, s1, s0 /* tmp12 */ + addu s0, t4, s1 /* tmp22 */ + subu t4, t4, s1 /* tmp23 */ + /* final output stage */ + addu t5, v0, t3 + subu v0, v0, t3 + addu t3, t2, t1 + subu t2, t2, t1 + addu t1, s0, t8 + subu s0, s0, t8 + addu t8, t4, t9 + subu t4, t4, t9 + addu t9, s3, t0 + subu s3, s3, t0 + addu t0, s2, t6 + subu s2, s2, t6 + sra t5, t5, 11 + sra t3, t3, 11 + sra t1, t1, 11 + sra t8, t8, 11 + sra t9, t9, 11 + sra t0, t0, 11 + sra s2, s2, 11 + sra s3, s3, 11 + sra t4, t4, 11 + sra s0, s0, 11 + sra t2, t2, 11 + sra v0, v0, 11 + sw t5, 0(a2) + sw t3, 32(a2) + sw t1, 64(a2) + sw t8, 96(a2) + sw t9, 128(a2) + sw t0, 160(a2) + sw s2, 192(a2) + sw s3, 224(a2) + sw t4, 256(a2) + sw s0, 288(a2) + sw t2, 320(a2) + sw v0, 352(a2) + bgtz a3, 1b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop + +END(jsimd_idct_12x12_pass1_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) +/* + * a0 = workspace + * a1 = output + */ + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 12 + +1: + /* Odd part */ + lw t0, 12(a0) + lw t1, 4(a0) + lw t2, 20(a0) + lw t3, 28(a0) + li t4, 10703 /* FIX(1.306562965) */ + li t5, 4433 /* FIX_0_541196100 */ + mul t4, t0, t4 /* tmp11 */ + mul t5, t0, t5 /* -tmp14 */ + addu t6, t1, t2 /* tmp10 */ + li t7, 2139 /* FIX(0.261052384) */ + mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */ + addu t6, t6, t3 /* tmp10 + z4 */ + li t8, 7053 /* FIX(0.860918669) */ + mul t6, t6, t8 /* tmp15 */ + li t8, 2295 /* FIX(0.280143716) */ + mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */ + addu t9, t2, t3 /* z3 + z4 */ + li s0, 8565 /* FIX(1.045510580) */ + mul t9, t9, s0 /* -tmp13 */ + li s0, 12112 /* FIX(1.478575242) */ + mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */ + li s1, 12998 /* FIX(1.586706681) */ + mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ + li s2, 5540 /* FIX(0.676326758) */ + mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ + li s3, 16244 /* FIX(1.982889723) */ + mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ + subu t1, t1, t3 /* z1 -= z4 */ + subu t0, t0, t2 /* z2 -= z3 */ + addu t2, t1, t0 /* z1 + z2 */ + li t3, 4433 /* FIX_0_541196100 */ + mul t2, t2, t3 /* z3 */ + li t3, 6270 /* FIX_0_765366865 */ + mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ + li t3, 15137 /* FIX_1_847759065 */ + mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ + addu t3, t6, t7 /* tmp12 */ + addu t7, t3, t4 + addu t7, t7, t8 /* tmp10 */ + subu t3, t3, t9 + subu t3, t3, t5 + subu t3, t3, s0 /* tmp12 */ + subu t9, t6, t9 + subu t9, t9, t4 + addu t9, t9, s1 /* tmp13 */ + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 /* tmp15 */ + addu t1, t2, t1 /* tmp11 */ + subu t0, t2, t0 /* tmp14 */ + /* even part */ + lw t2, 16(a0) /* z4 */ + lw t4, 8(a0) /* z1 */ + lw t5, 0(a0) /* z3 */ + lw t8, 24(a0) /* z2 */ + li s0, 10033 /* FIX(1.224744871) */ + li s1, 11190 /* FIX(1.366025404) */ + mul t2, t2, s0 /* z4 */ + mul s0, t4, s1 /* z4 */ + addiu t5, t5, 0x10 + sll t5, t5, 13 /* z3 */ + sll t4, t4, 13 /* z1 */ + sll t8, t8, 13 /* z2 */ + subu s1, t4, t8 /* tmp12 */ + addu s2, t5, t2 /* tmp10 */ + subu t2, t5, t2 /* tmp11 */ + addu s3, t5, s1 /* tmp21 */ + subu s1, t5, s1 /* tmp24 */ + addu t5, s0, t8 /* tmp12 */ + addu v0, s2, t5 /* tmp20 */ + subu t5, s2, t5 /* tmp25 */ + subu t4, s0, t4 + subu t4, t4, t8 /* tmp12 */ + addu t8, t2, t4 /* tmp22 */ + subu t2, t2, t4 /* tmp23 */ + /* increment counter and pointers */ + addiu a3, a3, -1 + addiu a0, a0, 32 + /* Final stage */ + addu t4, v0, t7 + subu v0, v0, t7 + addu t7, s3, t1 + subu s3, s3, t1 + addu t1, t8, t3 + subu t8, t8, t3 + addu t3, t2, t9 + subu t2, t2, t9 + addu t9, s1, t0 + subu s1, s1, t0 + addu t0, t5, t6 + subu t5, t5, t6 + sll t4, t4, 4 + sll t7, t7, 4 + sll t1, t1, 4 + sll t3, t3, 4 + sll t9, t9, 4 + sll t0, t0, 4 + sll t5, t5, 4 + sll s1, s1, 4 + sll t2, t2, 4 + sll t8, t8, 4 + sll s3, s3, 4 + sll v0, v0, 4 + shll_s.w t4, t4, 2 + shll_s.w t7, t7, 2 + shll_s.w t1, t1, 2 + shll_s.w t3, t3, 2 + shll_s.w t9, t9, 2 + shll_s.w t0, t0, 2 + shll_s.w t5, t5, 2 + shll_s.w s1, s1, 2 + shll_s.w t2, t2, 2 + shll_s.w t8, t8, 2 + shll_s.w s3, s3, 2 + shll_s.w v0, v0, 2 + srl t4, t4, 24 + srl t7, t7, 24 + srl t1, t1, 24 + srl t3, t3, 24 + srl t9, t9, 24 + srl t0, t0, 24 + srl t5, t5, 24 + srl s1, s1, 24 + srl t2, t2, 24 + srl t8, t8, 24 + srl s3, s3, 24 + srl v0, v0, 24 + lw t6, 0(a1) + addiu t4, t4, 0x80 + addiu t7, t7, 0x80 + addiu t1, t1, 0x80 + addiu t3, t3, 0x80 + addiu t9, t9, 0x80 + addiu t0, t0, 0x80 + addiu t5, t5, 0x80 + addiu s1, s1, 0x80 + addiu t2, t2, 0x80 + addiu t8, t8, 0x80 + addiu s3, s3, 0x80 + addiu v0, v0, 0x80 + sb t4, 0(t6) + sb t7, 1(t6) + sb t1, 2(t6) + sb t3, 3(t6) + sb t9, 4(t6) + sb t0, 5(t6) + sb t5, 6(t6) + sb s1, 7(t6) + sb t2, 8(t6) + sb t8, 9(t6) + sb s3, 10(t6) + sb v0, 11(t6) + bgtz a3, 1b + addiu a1, a1, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + jr ra + nop + +END(jsimd_idct_12x12_pass2_dspr2) + + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_convsamp_dspr2) +/* + * a0 = sample_data + * a1 = start_col + * a2 = workspace + */ + lw t0, 0(a0) + li t7, 0xff80ff80 + addu t0, t0, a1 + ulw t1, 0(t0) + ulw t2, 4(t0) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + lw t0, 4(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 0(a2) + usw t4, 4(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 8(a2) + usw t6, 12(a2) + + lw t0, 8(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 16(a2) + usw t4, 20(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 24(a2) + usw t6, 28(a2) + + lw t0, 12(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 32(a2) + usw t4, 36(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 40(a2) + usw t6, 44(a2) + + lw t0, 16(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 48(a2) + usw t4, 52(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 56(a2) + usw t6, 60(a2) + + lw t0, 20(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 64(a2) + usw t4, 68(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 72(a2) + usw t6, 76(a2) + + lw t0, 24(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 80(a2) + usw t4, 84(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 88(a2) + usw t6, 92(a2) + + lw t0, 28(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 96(a2) + usw t4, 100(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 104(a2) + usw t6, 108(a2) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 112(a2) + usw t4, 116(a2) + usw t5, 120(a2) + usw t6, 124(a2) + + j ra + nop + +END(jsimd_convsamp_dspr2) + + +#ifndef __mips_soft_float + +/*****************************************************************************/ +LEAF_DSPR2(jsimd_convsamp_float_dspr2) +/* + * a0 = sample_data + * a1 = start_col + * a2 = workspace + */ + .set at + + lw t0, 0(a0) + addu t0, t0, a1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 4(a0) + swc1 f2, 0(a2) + swc1 f4, 4(a2) + swc1 f6, 8(a2) + addu t0, t0, a1 + swc1 f8, 12(a2) + swc1 f10, 16(a2) + swc1 f12, 20(a2) + swc1 f14, 24(a2) + swc1 f16, 28(a2) + /* elemr 1 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 8(a0) + swc1 f2, 32(a2) + swc1 f4, 36(a2) + swc1 f6, 40(a2) + addu t0, t0, a1 + swc1 f8, 44(a2) + swc1 f10, 48(a2) + swc1 f12, 52(a2) + swc1 f14, 56(a2) + swc1 f16, 60(a2) + /* elemr 2 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 12(a0) + swc1 f2, 64(a2) + swc1 f4, 68(a2) + swc1 f6, 72(a2) + addu t0, t0, a1 + swc1 f8, 76(a2) + swc1 f10, 80(a2) + swc1 f12, 84(a2) + swc1 f14, 88(a2) + swc1 f16, 92(a2) + /* elemr 3 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 16(a0) + swc1 f2, 96(a2) + swc1 f4, 100(a2) + swc1 f6, 104(a2) + addu t0, t0, a1 + swc1 f8, 108(a2) + swc1 f10, 112(a2) + swc1 f12, 116(a2) + swc1 f14, 120(a2) + swc1 f16, 124(a2) + /* elemr 4 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 20(a0) + swc1 f2, 128(a2) + swc1 f4, 132(a2) + swc1 f6, 136(a2) + addu t0, t0, a1 + swc1 f8, 140(a2) + swc1 f10, 144(a2) + swc1 f12, 148(a2) + swc1 f14, 152(a2) + swc1 f16, 156(a2) + /* elemr 5 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 24(a0) + swc1 f2, 160(a2) + swc1 f4, 164(a2) + swc1 f6, 168(a2) + addu t0, t0, a1 + swc1 f8, 172(a2) + swc1 f10, 176(a2) + swc1 f12, 180(a2) + swc1 f14, 184(a2) + swc1 f16, 188(a2) + /* elemr 6 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 28(a0) + swc1 f2, 192(a2) + swc1 f4, 196(a2) + swc1 f6, 200(a2) + addu t0, t0, a1 + swc1 f8, 204(a2) + swc1 f10, 208(a2) + swc1 f12, 212(a2) + swc1 f14, 216(a2) + swc1 f16, 220(a2) + /* elemr 7 */ + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + swc1 f2, 224(a2) + swc1 f4, 228(a2) + swc1 f6, 232(a2) + swc1 f8, 236(a2) + swc1 f10, 240(a2) + swc1 f12, 244(a2) + swc1 f14, 248(a2) + swc1 f16, 252(a2) + + j ra + nop + +END(jsimd_convsamp_float_dspr2) + +#endif + +/*****************************************************************************/ diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h new file mode 100644 index 0000000000..12cfda486c --- /dev/null +++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h @@ -0,0 +1,292 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * Copyright (C) 2018, Matthieu Darbois. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +#ifdef __ELF__ +#define HIDDEN_SYMBOL(symbol) .hidden symbol; +#else +#define HIDDEN_SYMBOL(symbol) +#endif + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + HIDDEN_SYMBOL(symbol) \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: \ + .frame sp, 0, ra; \ + .set push; \ + .set arch = mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function, .-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * preserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 +.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." +.endif +.if \stack_offset != 0 + addiu sp, sp, -\stack_offset +.endif + sw \r1, 0(sp) +.if \r2 != 0 + sw \r2, 4(sp) +.endif +.if \r3 != 0 + sw \r3, 8(sp) +.endif +.if \r4 != 0 + sw \r4, 12(sp) +.endif +.if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) +.endif +.if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) +.endif +.if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) +.endif +.if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) +.endif +.if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) +.endif +.if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) +.endif +.if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) +.endif +.if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) +.endif +.if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) +.endif +.if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) +.endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with RESTORE_REGS_FROM_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 +.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." +.endif + lw \r1, 0(sp) +.if \r2 != 0 + lw \r2, 4(sp) +.endif +.if \r3 != 0 + lw \r3, 8(sp) +.endif +.if \r4 != 0 + lw \r4, 12(sp) +.endif +.if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) +.endif +.if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) +.endif +.if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) +.endif +.if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) +.endif +.if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) +.endif +.if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) +.endif +.if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) +.endif +.if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) +.endif +.if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) +.endif +.if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) +.endif +.if \stack_offset != 0 + addiu sp, sp, \stack_offset +.endif +.endm diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc new file mode 100644 index 0000000000..6f6d7f29d1 --- /dev/null +++ b/media/libjpeg/simd/nasm/jcolsamp.inc @@ -0,0 +1,135 @@ +; +; jcolsamp.inc - private declarations for color conversion & up/downsampling +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; -------------------------------------------------------------------------- + +; pseudo-resisters to make ordering of RGB configurable +; +%if RGB_RED == 0 +%define mmA mm0 +%define mmB mm1 +%define xmmA xmm0 +%define xmmB xmm1 +%define ymmA ymm0 +%define ymmB ymm1 +%elif RGB_GREEN == 0 +%define mmA mm2 +%define mmB mm3 +%define xmmA xmm2 +%define xmmB xmm3 +%define ymmA ymm2 +%define ymmB ymm3 +%elif RGB_BLUE == 0 +%define mmA mm4 +%define mmB mm5 +%define xmmA xmm4 +%define xmmB xmm5 +%define ymmA ymm4 +%define ymmB ymm5 +%else +%define mmA mm6 +%define mmB mm7 +%define xmmA xmm6 +%define xmmB xmm7 +%define ymmA ymm6 +%define ymmB ymm7 +%endif + +%if RGB_RED == 1 +%define mmC mm0 +%define mmD mm1 +%define xmmC xmm0 +%define xmmD xmm1 +%define ymmC ymm0 +%define ymmD ymm1 +%elif RGB_GREEN == 1 +%define mmC mm2 +%define mmD mm3 +%define xmmC xmm2 +%define xmmD xmm3 +%define ymmC ymm2 +%define ymmD ymm3 +%elif RGB_BLUE == 1 +%define mmC mm4 +%define mmD mm5 +%define xmmC xmm4 +%define xmmD xmm5 +%define ymmC ymm4 +%define ymmD ymm5 +%else +%define mmC mm6 +%define mmD mm7 +%define xmmC xmm6 +%define xmmD xmm7 +%define ymmC ymm6 +%define ymmD ymm7 +%endif + +%if RGB_RED == 2 +%define mmE mm0 +%define mmF mm1 +%define xmmE xmm0 +%define xmmF xmm1 +%define ymmE ymm0 +%define ymmF ymm1 +%elif RGB_GREEN == 2 +%define mmE mm2 +%define mmF mm3 +%define xmmE xmm2 +%define xmmF xmm3 +%define ymmE ymm2 +%define ymmF ymm3 +%elif RGB_BLUE == 2 +%define mmE mm4 +%define mmF mm5 +%define xmmE xmm4 +%define xmmF xmm5 +%define ymmE ymm4 +%define ymmF ymm5 +%else +%define mmE mm6 +%define mmF mm7 +%define xmmE xmm6 +%define xmmF xmm7 +%define ymmE ymm6 +%define ymmF ymm7 +%endif + +%if RGB_RED == 3 +%define mmG mm0 +%define mmH mm1 +%define xmmG xmm0 +%define xmmH xmm1 +%define ymmG ymm0 +%define ymmH ymm1 +%elif RGB_GREEN == 3 +%define mmG mm2 +%define mmH mm3 +%define xmmG xmm2 +%define xmmH xmm3 +%define ymmG ymm2 +%define ymmH ymm3 +%elif RGB_BLUE == 3 +%define mmG mm4 +%define mmH mm5 +%define xmmG xmm4 +%define xmmH xmm5 +%define ymmG ymm4 +%define ymmH ymm5 +%else +%define mmG mm6 +%define mmH mm7 +%define xmmG xmm6 +%define xmmH xmm7 +%define ymmG ymm6 +%define ymmH ymm7 +%endif + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/nasm/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc new file mode 100644 index 0000000000..9192f66f0c --- /dev/null +++ b/media/libjpeg/simd/nasm/jdct.inc @@ -0,0 +1,31 @@ +; +; jdct.inc - private declarations for forward & reverse DCT subsystems +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2018, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; Each IDCT routine is responsible for range-limiting its results and +; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could +; be quite far out of range if the input data is corrupt, so a bulletproof +; range-limiting step is required. We use a mask-and-table-lookup method +; to do the combined operations quickly. +; +%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples + +%define ROW(n, b, s) ((b) + (n) * (s)) +%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE) + +%define DWBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD) +%define MMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD) +%define XMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD) +%define YMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD) + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/nasm/jpeg_nbits_table.inc b/media/libjpeg/simd/nasm/jpeg_nbits_table.inc new file mode 100644 index 0000000000..2ce6c284d9 --- /dev/null +++ b/media/libjpeg/simd/nasm/jpeg_nbits_table.inc @@ -0,0 +1,4097 @@ +jpeg_nbits_table db \ + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc new file mode 100644 index 0000000000..667024a5f9 --- /dev/null +++ b/media/libjpeg/simd/nasm/jsimdcfg.inc @@ -0,0 +1,93 @@ +; +; Automatically generated include file from jsimdcfg.inc.h +; +; +; -- jpeglib.h +; +%define DCTSIZE 8 +%define DCTSIZE2 64 +; +; -- jmorecfg.h +; +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define EXT_RGB_RED 0 +%define EXT_RGB_GREEN 1 +%define EXT_RGB_BLUE 2 +%define EXT_RGB_PIXELSIZE 3 +%define EXT_RGBX_RED 0 +%define EXT_RGBX_GREEN 1 +%define EXT_RGBX_BLUE 2 +%define EXT_RGBX_PIXELSIZE 4 +%define EXT_BGR_RED 2 +%define EXT_BGR_GREEN 1 +%define EXT_BGR_BLUE 0 +%define EXT_BGR_PIXELSIZE 3 +%define EXT_BGRX_RED 2 +%define EXT_BGRX_GREEN 1 +%define EXT_BGRX_BLUE 0 +%define EXT_BGRX_PIXELSIZE 4 +%define EXT_XBGR_RED 3 +%define EXT_XBGR_GREEN 2 +%define EXT_XBGR_BLUE 1 +%define EXT_XBGR_PIXELSIZE 4 +%define EXT_XRGB_RED 1 +%define EXT_XRGB_GREEN 2 +%define EXT_XRGB_BLUE 3 +%define EXT_XRGB_PIXELSIZE 4 +%define RGBX_FILLER_0XFF 1 +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. +; +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) +%define CENTERJSAMPLE 128 +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) +; +; -- jdct.h +; +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) +%define float FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float) +; To maximize parallelism, Type short is changed to short. +; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) +; +; -- jsimd.h +; +%define JSIMD_NONE 0x00 +%define JSIMD_MMX 0x01 +%define JSIMD_3DNOW 0x02 +%define JSIMD_SSE 0x04 +%define JSIMD_SSE2 0x08 +%define JSIMD_AVX2 0x80 diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc new file mode 100644 index 0000000000..9930d80c2a --- /dev/null +++ b/media/libjpeg/simd/nasm/jsimdext.inc @@ -0,0 +1,479 @@ +; +; jsimdext.inc - common declarations +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2010, 2016, 2019, D. R. Commander. +; Copyright (C) 2018, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 +; +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +; ========================================================================== +; System-dependent configurations + +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +; * Microsoft Visual C++ +; * MinGW (Minimalist GNU for Windows) +; * CygWin +; * LCC-Win32 + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use32 class=CODE +%define SEG_CONST .rdata align=32 public use32 class=CONST +%endif + +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +; * Microsoft Visual C++ + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use64 class=CODE +%define SEG_CONST .rdata align=32 public use64 class=CONST +%endif +%define EXTN(name) name ; foo() -> foo + +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +; * Borland C++ (Win32) + +; -- segment definition -- +; +%define SEG_TEXT _text align=32 public use32 class=CODE +%define SEG_CONST _data align=32 public use32 class=DATA + +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +; * Linux +; * *BSD family Unix using elf format +; * Unix System V, including Solaris x86, UnixWare and SCO Unix + +; mark stack as non-executable +section .note.GNU-stack noalloc noexec nowrite progbits + +; -- segment definition -- +; +%ifdef __x86_64__ +%define SEG_TEXT .text progbits align=32 +%define SEG_CONST .rodata progbits align=32 +%else +%define SEG_TEXT .text progbits alloc exec nowrite align=32 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=32 +%endif + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo + +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +; * Older Linux using a.out format (nasm -f aout -DAOUT ...) +; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC + +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) + +; -- segment definition -- +; +%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why? +%define SEG_CONST .rodata align=32 + +; The generation of position-independent code (PIC) is the default on Darwin. +; +%define PIC +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing + +%else ; ----(Other case)---------------------- + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +%endif ; ---------------------------------------------- + +; ========================================================================== + +; -------------------------------------------------------------------------- +; Common types +; +%ifdef __x86_64__ +%define POINTER qword ; general pointer type +%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) +%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%else +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%endif + +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT + +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT + +%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT + +; NASM is buggy and doesn't properly handle operand sizes for SSE +; instructions, so for now we have to define XMMWORD as blank. +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT + +%define YMMWORD ; int256 (AVX register) +%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD) +%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT + +; Similar hacks for when we load a dword or MMWORD into an xmm# register +%define XMM_DWORD +%define XMM_MMWORD + +%define SIZEOF_BYTE 1 ; sizeof(byte) +%define SIZEOF_WORD 2 ; sizeof(word) +%define SIZEOF_DWORD 4 ; sizeof(dword) +%define SIZEOF_QWORD 8 ; sizeof(qword) +%define SIZEOF_OWORD 16 ; sizeof(oword) +%define SIZEOF_YWORD 32 ; sizeof(yword) + +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT +%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT + +; -------------------------------------------------------------------------- +; External Symbol Name +; +%ifndef EXTN +%define EXTN(name) _ %+ name ; foo() -> _foo +%endif + +; -------------------------------------------------------------------------- +; Hidden symbols +; +%ifdef ELF ; ----(nasm -felf[64] -DELF ...)-------- +%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden +%define GLOBAL_DATA(name) global EXTN(name):data hidden +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +%ifdef __YASM_VER__ +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%else +%if __NASM_VERSION_ID__ >= 0x020E0000 +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%endif +%endif +%endif + +%ifndef GLOBAL_FUNCTION +%define GLOBAL_FUNCTION(name) global EXTN(name) +%endif +%ifndef GLOBAL_DATA +%define GLOBAL_DATA(name) global EXTN(name) +%endif + +; -------------------------------------------------------------------------- +; Macros for position-independent code (PIC) support +; +%ifndef GOT_SYMBOL +%undef PIC +%endif + +%ifdef PIC ; ------------------------------------------- + +%ifidn GOT_SYMBOL, _MACHO_PIC_ ; -------------------- + +; At present, nasm doesn't seem to support PIC generation for Mach-O. +; The PIC support code below is a little tricky. + + SECTION SEG_CONST +const_base: + +%define GOTOFF(got, sym) (got) + (sym) - const_base + +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. + call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust +%%geteip: + mov ecx, POINTER [esp] + ret +%%adjust: + push ebp + xor ebp, ebp ; ebp = 0 +%ifidni %1, ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D, 0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: +%else ; (%1 != ebx) + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D, 0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: + mov %1, ecx +%endif ; (%1 == ebx) + pop ebp +%endmacro + +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- + +%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff + +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done +%%geteip: + mov %1, POINTER [esp] + ret +%%done: +%endmacro + +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- + +%imacro pushpic 1.nolist + push %1 +%endmacro +%imacro poppic 1.nolist + pop %1 +%endmacro +%imacro movpic 2.nolist + mov %1, %2 +%endmacro + +%else ; !PIC ----------------------------------------- + +%define GOTOFF(got, sym) (sym) + +%imacro get_GOT 1.nolist +%endmacro +%imacro pushpic 1.nolist +%endmacro +%imacro poppic 1.nolist +%endmacro +%imacro movpic 2.nolist +%endmacro + +%endif ; PIC ----------------------------------------- + +; -------------------------------------------------------------------------- +; Align the next instruction on {2,4,8,16,..}-byte boundary. +; ".balign n,,m" in GNU as +; +%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) +%define FILLB(b, n) (($$-(b)) & ((n)-1)) + +%imacro alignx 1-2.nolist 0xFFFF +%%bs: \ + times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \ + db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \ + db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \ + db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \ + db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \ + db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \ + db 0x8B, 0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \ + db 0x90 ; nop +%endmacro + +; Align the next data on {2,4,8,16,..}-byte boundary. +; +%imacro alignz 1.nolist + align %1, db 0 ; filling zeros +%endmacro + +%ifdef __x86_64__ + +%ifdef WIN64 + +%imacro collect_args 1 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 + mov r10, rcx +%if %1 > 1 + mov r11, rdx +%endif +%if %1 > 2 + push r12 + mov r12, r8 +%endif +%if %1 > 3 + push r13 + mov r13, r9 +%endif +%if %1 > 4 + push r14 + mov r14, [rax+48] +%endif +%if %1 > 5 + push r15 + mov r15, [rax+56] +%endif + push rsi + push rdi +%endmacro + +%imacro uncollect_args 1 + pop rdi + pop rsi +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD +%endmacro + +%imacro push_xmm 1 + sub rsp, %1 * SIZEOF_XMMWORD + movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8 +%if %1 > 1 + movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9 +%endif +%if %1 > 2 + movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10 +%endif +%if %1 > 3 + movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11 +%endif +%endmacro + +%imacro pop_xmm 1 + movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD] +%if %1 > 1 + movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD] +%endif +%if %1 > 2 + movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD] +%endif +%if %1 > 3 + movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD] +%endif + add rsp, %1 * SIZEOF_XMMWORD +%endmacro + +%else + +%imacro collect_args 1 + push r10 + mov r10, rdi +%if %1 > 1 + push r11 + mov r11, rsi +%endif +%if %1 > 2 + push r12 + mov r12, rdx +%endif +%if %1 > 3 + push r13 + mov r13, rcx +%endif +%if %1 > 4 + push r14 + mov r14, r8 +%endif +%if %1 > 5 + push r15 + mov r15, r9 +%endif +%endmacro + +%imacro uncollect_args 1 +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif +%if %1 > 1 + pop r11 +%endif + pop r10 +%endmacro + +%imacro push_xmm 1 +%endmacro + +%imacro pop_xmm 1 +%endmacro + +%endif + +%endif + +; -------------------------------------------------------------------------- +; Defines picked up from the C headers +; +%include "jsimdcfg.inc" + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/powerpc/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c new file mode 100644 index 0000000000..170f90ff80 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c @@ -0,0 +1,269 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-altivec.c */ + + +void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, + rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = { 0 }; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = { 0 }; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh, crl, crh, cbl, cbh; + __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, + pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, + pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; + __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }, + pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr0 += 16, outptr1 += 16, outptr2 += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = VEC_LD(0, tmpbuf); + rgb1 = VEC_LD(16, tmpbuf); + rgb2 = VEC_LD(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = VEC_LD(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = VEC_LD(0, inptr); + if (num_cols > 16) + rgb1 = VEC_LD(16, inptr); + if (num_cols > 32) + rgb2 = VEC_LD(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = VEC_LD(48, inptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr0); + + /* Calculate Cb values */ + cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); + cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); + cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); + cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); + cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, + (__vector unsigned int)cb0); + cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, + (__vector unsigned int)cb1); + cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, + (__vector unsigned int)cb2); + cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, + (__vector unsigned int)cb3); + cbl = vec_perm((__vector unsigned short)cb0, + (__vector unsigned short)cb1, shift_pack_index); + cbh = vec_perm((__vector unsigned short)cb2, + (__vector unsigned short)cb3, shift_pack_index); + cb = vec_pack(cbl, cbh); + vec_st(cb, 0, outptr1); + + /* Calculate Cr values */ + cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); + cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); + cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); + cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); + cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, + (__vector unsigned int)cr0); + cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, + (__vector unsigned int)cr1); + cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, + (__vector unsigned int)cr2); + cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, + (__vector unsigned int)cr3); + crl = vec_perm((__vector unsigned short)cr0, + (__vector unsigned short)cr1, shift_pack_index); + crh = vec_perm((__vector unsigned short)cr2, + (__vector unsigned short)cr3, shift_pack_index); + cr = vec_pack(crl, crh); + vec_st(cr, 0, outptr2); + } + } +} diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c new file mode 100644 index 0000000000..d670dbcda3 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c @@ -0,0 +1,116 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_081 5329 /* FIX(0.08131) */ +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_168 11059 /* FIX(0.16874) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_331 21709 /* FIX(0.33126) */ +#define F_0_418 27439 /* FIX(0.41869) */ +#define F_0_500 32768 /* FIX(0.50000) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 \ + { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 } +#define RGBG_INDEX1 \ + { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 } +#define RGBG_INDEX2 \ + { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 } +#define RGBG_INDEX3 \ + { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 } +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX \ + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 \ + { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 } +#define RGBG_INDEX1 \ + { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 } +#define RGBG_INDEX2 \ + { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 } +#define RGBG_INDEX3 \ + { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX \ + { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX \ + { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX \ + { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 } +#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c new file mode 100644 index 0000000000..a11a7e7021 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c @@ -0,0 +1,111 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> GRAYSCALE CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 \ + { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 } +#define RGBG_INDEX1 \ + { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 } +#define RGBG_INDEX2 \ + { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 } +#define RGBG_INDEX3 \ + { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 } +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX \ + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 } +#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 \ + { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 } +#define RGBG_INDEX1 \ + { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 } +#define RGBG_INDEX2 \ + { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 } +#define RGBG_INDEX3 \ + { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 } +#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX \ + { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 } +#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX \ + { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 } +#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX \ + { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 } +#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec diff --git a/media/libjpeg/simd/powerpc/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c new file mode 100644 index 0000000000..b280cbbded --- /dev/null +++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c @@ -0,0 +1,228 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jcgray-altivec.c */ + + +void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; +#endif + + __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, + rgbg0, rgbg1, rgbg2, rgbg3, y; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = { 0 }; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = { 0 }; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh; + __vector int y0, y1, y2, y3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_ld(16, inptr); + if (num_cols > 32) + rgb2 = vec_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_ld(48, inptr); +#endif + } + } +#else + /* Little endian */ + rgb0 = vec_vsx_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_vsx_ld(16, inptr); + if (num_cols > 32) + rgb2 = vec_vsx_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_vsx_ld(48, inptr); +#endif +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr); + } + } +} diff --git a/media/libjpeg/simd/powerpc/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c new file mode 100644 index 0000000000..6e25b8db90 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c @@ -0,0 +1,159 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA DOWNSAMPLING */ + +#include "jsimd_altivec.h" +#include "jcsample.h" + + +void jsimd_h2v1_downsample_altivec(JDIMENSION image_width, + int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, + JSAMPARRAY output_data) +{ + int outrow, outcol; + JDIMENSION output_cols = width_in_blocks * DCTSIZE; + JSAMPROW inptr, outptr; + + __vector unsigned char this0, next0, out; + __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(0, 1) }, + pw_one = { __8X(1) }; + __vector unsigned char even_odd_index = + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr += 32, outptr += 16) { + + this0 = vec_ld(0, inptr); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + outl = vec_add(this0e, this0o); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_one); + + if (outcol > 8) { + next0 = vec_ld(16, inptr); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + outh = vec_add(next0e, next0o); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_one); + } else + outh = vec_splat_u16(0); + + out = vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} + + +void +jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, outcol; + JDIMENSION output_cols = width_in_blocks * DCTSIZE; + JSAMPROW inptr0, inptr1, outptr; + + __vector unsigned char this0, next0, this1, next1, out; + __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, + next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(1, 2) }, + pw_two = { __8X(2) }; + __vector unsigned char even_odd_index = + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (inrow = 0, outrow = 0; outrow < v_samp_factor; + inrow += 2, outrow++) { + + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr = output_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { + + this0 = vec_ld(0, inptr0); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + out0l = vec_add(this0e, this0o); + + this1 = vec_ld(0, inptr1); + this1 = vec_perm(this1, this1, even_odd_index); + this1e = (__vector unsigned short)VEC_UNPACKHU(this1); + this1o = (__vector unsigned short)VEC_UNPACKLU(this1); + out1l = vec_add(this1e, this1o); + + outl = vec_add(out0l, out1l); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_two); + + if (outcol > 8) { + next0 = vec_ld(16, inptr0); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + out0h = vec_add(next0e, next0o); + + next1 = vec_ld(16, inptr1); + next1 = vec_perm(next1, next1, even_odd_index); + next1e = (__vector unsigned short)VEC_UNPACKHU(next1); + next1o = (__vector unsigned short)VEC_UNPACKLU(next1); + out1h = vec_add(next1e, next1o); + + outh = vec_add(out0h, out1h); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_two); + } else + outh = vec_splat_u16(0); + + out = vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h new file mode 100644 index 0000000000..2ac48167fc --- /dev/null +++ b/media/libjpeg/simd/powerpc/jcsample.h @@ -0,0 +1,28 @@ +/* + * jcsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +LOCAL(void) +expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, + JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int)(output_cols - input_cols); + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} diff --git a/media/libjpeg/simd/powerpc/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c new file mode 100644 index 0000000000..68d52bd8a2 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c @@ -0,0 +1,276 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-altivec.c */ + + +void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = out_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh, + crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; + __vector int g0, g1, g2, g3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. + */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; +#endif + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16, + inptr0 += 16, inptr1 += 16, inptr2 += 16) { + + y = vec_ld(0, inptr0); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + yl = (__vector signed short)VEC_UNPACKHU(y); + yh = (__vector signed short)VEC_UNPACKLU(y); + + cb = vec_ld(0, inptr1); + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + bl = vec_add(cbl, cbl); + bh = vec_add(cbh, cbh); + bl = vec_madds(bl, pw_mf0228, pw_one); + bh = vec_madds(bh, pw_mf0228, pw_one); + bl = vec_sra(bl, (__vector unsigned short)pw_one); + bh = vec_sra(bh, (__vector unsigned short)pw_one); + bl = vec_add(bl, cbl); + bh = vec_add(bh, cbh); + bl = vec_add(bl, cbl); + bh = vec_add(bh, cbh); + bl = vec_add(bl, yl); + bh = vec_add(bh, yh); + + rl = vec_add(crl, crl); + rh = vec_add(crh, crh); + rl = vec_madds(rl, pw_f0402, pw_one); + rh = vec_madds(rh, pw_f0402, pw_one); + rl = vec_sra(rl, (__vector unsigned short)pw_one); + rh = vec_sra(rh, (__vector unsigned short)pw_one); + rl = vec_add(rl, crl); + rh = vec_add(rh, crh); + rl = vec_add(rl, yl); + rh = vec_add(rh, yh); + + g0w = vec_mergeh(cbl, crl); + g1w = vec_mergel(cbl, crl); + g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf); + g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf); + g2w = vec_mergeh(cbh, crh); + g3w = vec_mergel(cbh, crh); + g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf); + g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index); + gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index); + gl = vec_sub(gl, crl); + gh = vec_sub(gh, crh); + gl = vec_add(gl, yl); + gh = vec_add(gh, yh); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. + */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c new file mode 100644 index 0000000000..eb35b67176 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c @@ -0,0 +1,106 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 \ + { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } +#define RGB_INDEX1 \ + { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } +#define RGB_INDEX2 \ + { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX \ + { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 \ + { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } +#define RGB_INDEX1 \ + { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } +#define RGB_INDEX2 \ + { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX \ + { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX \ + { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX \ + { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c new file mode 100644 index 0000000000..79c577f141 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c @@ -0,0 +1,130 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 \ + { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } +#define RGB_INDEX1 \ + { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } +#define RGB_INDEX2 \ + { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX \ + { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extrgbx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extrgbx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 \ + { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } +#define RGB_INDEX1 \ + { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } +#define RGB_INDEX2 \ + { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX \ + { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extbgrx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extbgrx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX \ + { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extxbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extxbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX \ + { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } +#define jsimd_h2v1_merged_upsample_altivec \ + jsimd_h2v1_extxrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec \ + jsimd_h2v2_extxrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec diff --git a/media/libjpeg/simd/powerpc/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c new file mode 100644 index 0000000000..40f02c33ea --- /dev/null +++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c @@ -0,0 +1,329 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-altivec.c */ + + +void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh, + crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, + rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo; + __vector int g_y0, g_y1, g_y2, g_y3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. + */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + even_index = + { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 }, + odd_index = + { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 }; +#else + shift_pack_index = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + even_index = + { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 }, + odd_index = + { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 }; +#endif + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { + + cb = vec_ld(0, inptr1); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + b_yl = vec_add(cbl, cbl); + b_yh = vec_add(cbh, cbh); + b_yl = vec_madds(b_yl, pw_mf0228, pw_one); + b_yh = vec_madds(b_yh, pw_mf0228, pw_one); + b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); + b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + + r_yl = vec_add(crl, crl); + r_yh = vec_add(crh, crh); + r_yl = vec_madds(r_yl, pw_f0402, pw_one); + r_yh = vec_madds(r_yh, pw_f0402, pw_one); + r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one); + r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one); + r_yl = vec_add(r_yl, crl); + r_yh = vec_add(r_yh, crh); + + g_y0w = vec_mergeh(cbl, crl); + g_y1w = vec_mergel(cbl, crl); + g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf); + g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf); + g_y2w = vec_mergeh(cbh, crh); + g_y3w = vec_mergel(cbh, crh); + g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf); + g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1, + shift_pack_index); + g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3, + shift_pack_index); + g_yl = vec_sub(g_yl, crl); + g_yh = vec_sub(g_yh, crh); + + for (yloop = 0; yloop < 2 && num_cols > 0; yloop++, + num_cols -= RGB_PIXELSIZE * 16, + outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { + + y = vec_ld(0, inptr0); + ye = (__vector signed short)vec_perm(pb_zero, y, even_index); + yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); + + if (yloop == 0) { + be = vec_add(b_yl, ye); + bo = vec_add(b_yl, yo); + re = vec_add(r_yl, ye); + ro = vec_add(r_yl, yo); + ge = vec_add(g_yl, ye); + go = vec_add(g_yl, yo); + } else { + be = vec_add(b_yh, ye); + bo = vec_add(b_yh, yo); + re = vec_add(r_yh, ye); + ro = vec_add(r_yh, yo); + ge = vec_add(g_yh, ye); + go = vec_add(g_yh, yo); + } + + rl = vec_mergeh(re, ro); + rh = vec_mergel(re, ro); + gl = vec_mergeh(ge, go); + gh = vec_mergel(ge, go); + bl = vec_mergeh(be, bo); + bh = vec_mergel(be, bo); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. + */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} + + +void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW inptr, outptr; + + inptr = input_buf[0][in_row_group_ctr]; + outptr = output_buf[0]; + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; + output_buf[0] = output_buf[1]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = inptr; + output_buf[0] = outptr; +} diff --git a/media/libjpeg/simd/powerpc/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c new file mode 100644 index 0000000000..04df0cf108 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c @@ -0,0 +1,400 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_altivec.h" + + +void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0, + out; + __vector short this0e, this0o, this0l, this0h, last0l, last0h, + next0l, next0h, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, + last_index_col0 = + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + last_index = + { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }, + next_index = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }, + next_index_lastcol = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 }, +#if __BIG_ENDIAN__ + merge_pack_index = + { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; +#else + merge_pack_index = + { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; +#endif + __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + if (downsampled_width & 15) + inptr[downsampled_width] = inptr[downsampled_width - 1]; + + this0 = vec_ld(0, inptr); + p_last0 = vec_perm(this0, this0, last_index_col0); + last0 = this0; + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr += 16, outptr += 32) { + + if (downsampled_width - incol > 0) { + p_last0 = vec_perm(last0, this0, last_index); + last0 = this0; + } + + if (incol <= 16) + p_next0 = vec_perm(this0, this0, next_index_lastcol); + else { + next0 = vec_ld(16, inptr); + p_next0 = vec_perm(this0, next0, next_index); + } + + this0e = (__vector short)vec_mule(this0, pb_three); + this0o = (__vector short)vec_mulo(this0, pb_three); + this0l = vec_mergeh(this0e, this0o); + this0h = vec_mergel(this0e, this0o); + + last0l = (__vector short)VEC_UNPACKHU(p_last0); + last0h = (__vector short)VEC_UNPACKLU(p_last0); + last0l = vec_add(last0l, pw_one); + + next0l = (__vector short)VEC_UNPACKHU(p_next0); + next0h = (__vector short)VEC_UNPACKLU(p_next0); + next0l = vec_add(next0l, pw_two); + + outle = vec_add(this0l, last0l); + outlo = vec_add(this0l, next0l); + outle = vec_sr(outle, (__vector unsigned short)pw_two); + outlo = vec_sr(outlo, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr); + + if (incol > 8) { + last0h = vec_add(last0h, pw_one); + next0h = vec_add(next0h, pw_two); + + outhe = vec_add(this0h, last0h); + outho = vec_add(this0h, next0h); + outhe = vec_sr(outhe, (__vector unsigned short)pw_two); + outho = vec_sr(outho, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr); + } + + this0 = next0; + } + } +} + + +void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char this_1, this0, this1, out; + __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, + lastcolsum_1h, lastcolsum1h, + p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, + thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, + nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 }, + nextcolsum1l = { 0 }, nextcolsum1h = { 0 }, + p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, + tmpl, tmph, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, + last_index_col0 = + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + last_index = + { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, + next_index = + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }, + next_index_lastcol = + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 }, +#if __BIG_ENDIAN__ + merge_pack_index = + { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; +#else + merge_pack_index = + { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; +#endif + __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, + pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; + __vector unsigned short pw_four = { __8X(4) }; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 15) { + inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; + inptr0[downsampled_width] = inptr0[downsampled_width - 1]; + inptr1[downsampled_width] = inptr1[downsampled_width - 1]; + } + + this0 = vec_ld(0, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(0, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + thiscolsum_1l = vec_add(this0l, this_1l); + thiscolsum_1h = vec_add(this0h, this_1h); + lastcolsum_1h = thiscolsum_1h; + p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + + this1 = vec_ld(0, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + thiscolsum1l = vec_add(this0l, this1l); + thiscolsum1h = vec_add(this0h, this1h); + lastcolsum1h = thiscolsum1h; + p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, + outptr0 += 32, outptr1 += 32) { + + if (downsampled_width - incol > 0) { + p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; + } + + if (incol <= 16) { + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, + next_index_lastcol); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, + next_index_lastcol); + } else { + this0 = vec_ld(16, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(16, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + nextcolsum_1l = vec_add(this0l, this_1l); + nextcolsum_1h = vec_add(this0h, this_1h); + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); + + this1 = vec_ld(16, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + nextcolsum1l = vec_add(this0l, this1l); + nextcolsum1h = vec_add(this0h, this1h); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); + } + + /* Process the upper row */ + + tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum_1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum_1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr0); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum_1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum_1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr0); + } + + /* Process the lower row */ + + tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr1); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr1); + } + + thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; + thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; + } + } +} + + +/* These are rarely used (mainly just for decompressing YCCK images) */ + +void jsimd_h2v1_upsample_altivec(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr); + vec_st(inh, 16, outptr); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr); + vec_st(inh, 48, outptr); + } + } + } +} + + +void jsimd_h2v2_upsample_altivec(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr0); + vec_st(inl, 0, outptr1); + + vec_st(inh, 16, outptr0); + vec_st(inh, 16, outptr1); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr0); + vec_st(inl, 32, outptr1); + + vec_st(inh, 48, outptr0); + vec_st(inh, 48, outptr1); + } + } + } +} diff --git a/media/libjpeg/simd/powerpc/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c new file mode 100644 index 0000000000..ad9af81e0c --- /dev/null +++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c @@ -0,0 +1,154 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER FORWARD DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). + */ + +#include "jsimd_altivec.h" + + +#define F_0_382 98 /* FIX(0.382683433) */ +#define F_0_541 139 /* FIX(0.541196100) */ +#define F_0_707 181 /* FIX(0.707106781) */ +#define F_1_306 334 /* FIX(1.306562965) */ + +#define CONST_BITS 8 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_FDCT() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out4 = vec_sub(tmp10, tmp11); \ + \ + z1 = vec_add(tmp12, tmp13); \ + z1 = vec_sl(z1, pre_multiply_scale_bits); \ + z1 = vec_madds(z1, pw_0707, pw_zero); \ + \ + out2 = vec_add(tmp13, z1); \ + out6 = vec_sub(tmp13, z1); \ + \ + /* Odd part */ \ + \ + tmp10 = vec_add(tmp4, tmp5); \ + tmp11 = vec_add(tmp5, tmp6); \ + tmp12 = vec_add(tmp6, tmp7); \ + \ + tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + z5 = vec_sub(tmp10, tmp12); \ + z5 = vec_madds(z5, pw_0382, pw_zero); \ + \ + z2 = vec_madds(tmp10, pw_0541, z5); \ + z4 = vec_madds(tmp12, pw_1306, z5); \ + \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + z3 = vec_madds(tmp11, pw_0707, pw_zero); \ + \ + z11 = vec_add(tmp7, z3); \ + z13 = vec_sub(tmp7, z3); \ + \ + out5 = vec_add(z13, z2); \ + out3 = vec_sub(z13, z2); \ + out1 = vec_add(z11, z4); \ + out7 = vec_sub(z11, z4); \ +} + + +void jsimd_fdct_ifast_altivec(DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z1, z2, z3, z4, z5, z11, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_0382 = { __8X(F_0_382 << CONST_SHIFT) }, + pw_0541 = { __8X(F_0_541 << CONST_SHIFT) }, + pw_0707 = { __8X(F_0_707 << CONST_SHIFT) }, + pw_1306 = { __8X(F_1_306 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c new file mode 100644 index 0000000000..3d4f017103 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c @@ -0,0 +1,258 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER FORWARD DCT */ + +#include "jsimd_altivec.h" + + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + + +#define DO_FDCT_COMMON(PASS) { \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * data2 = z1 + tmp13 * 0.765366865; \ + * data6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ + tmp1312l = vec_mergeh(tmp13, tmp12); \ + tmp1312h = vec_mergel(tmp13, tmp12); \ + \ + out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \ + out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \ + out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \ + out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out6 = vec_pack(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(tmp4, tmp6); \ + z4 = vec_add(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ + * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * data7 = tmp4 + z3; data5 = tmp5 + z4; \ + * data3 = tmp6 + z3; data1 = tmp7 + z4; \ + */ \ + \ + tmp47l = vec_mergeh(tmp4, tmp7); \ + tmp47h = vec_mergel(tmp4, tmp7); \ + \ + out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \ + out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \ + out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \ + out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \ + \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + \ + out7 = vec_pack(out7l, out7h); \ + out1 = vec_pack(out1l, out1h); \ + \ + tmp56l = vec_mergeh(tmp5, tmp6); \ + tmp56h = vec_mergel(tmp5, tmp6); \ + \ + out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \ + out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \ + out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \ + out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \ + \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + \ + out5 = vec_pack(out5l, out5h); \ + out3 = vec_pack(out3l, out3h); \ +} + +#define DO_FDCT_PASS1() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_sl(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_sl(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(1); \ +} + +#define DO_FDCT_PASS2() { \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_add(out0, pw_descale_p2x); \ + out0 = vec_sra(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_add(out4, pw_descale_p2x); \ + out4 = vec_sra(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(2); \ +} + + +void jsimd_fdct_islow_altivec(DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, + z3, z4, z34l, z34h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int z3l, z3h, z4l, z4h, + out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, + out7l, out7h; + + /* Constants */ + __vector short + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }, + pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT_PASS1(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT_PASS2(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/media/libjpeg/simd/powerpc/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c new file mode 100644 index 0000000000..456c6c6174 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c @@ -0,0 +1,255 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER INVERSE DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). + */ + +#include "jsimd_altivec.h" + + +#define F_1_082 277 /* FIX(1.082392200) */ +#define F_1_414 362 /* FIX(1.414213562) */ +#define F_1_847 473 /* FIX(1.847759065) */ +#define F_2_613 669 /* FIX(2.613125930) */ +#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ + +#define CONST_BITS 8 +#define PASS1_BITS 2 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_IDCT(in) { \ + /* Even part */ \ + \ + tmp10 = vec_add(in##0, in##4); \ + tmp11 = vec_sub(in##0, in##4); \ + tmp13 = vec_add(in##2, in##6); \ + \ + tmp12 = vec_sub(in##2, in##6); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ + tmp12 = vec_sub(tmp12, tmp13); \ + \ + tmp0 = vec_add(tmp10, tmp13); \ + tmp3 = vec_sub(tmp10, tmp13); \ + tmp1 = vec_add(tmp11, tmp12); \ + tmp2 = vec_sub(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + z13 = vec_add(in##5, in##3); \ + z10 = vec_sub(in##5, in##3); \ + z10s = vec_sl(z10, pre_multiply_scale_bits); \ + z11 = vec_add(in##1, in##7); \ + z12s = vec_sub(in##1, in##7); \ + z12s = vec_sl(z12s, pre_multiply_scale_bits); \ + \ + tmp11 = vec_sub(z11, z13); \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ + \ + tmp7 = vec_add(z11, z13); \ + \ + /* To avoid overflow... \ + * \ + * (Original) \ + * tmp12 = -2.613125930 * z10 + z5; \ + * \ + * (This implementation) \ + * tmp12 = (-1.613125930 - 1) * z10 + z5; \ + * = -1.613125930 * z10 - z10 + z5; \ + */ \ + \ + z5 = vec_add(z10s, z12s); \ + z5 = vec_madds(z5, pw_F1847, pw_zero); \ + \ + tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ + tmp10 = vec_sub(tmp10, z5); \ + tmp12 = vec_madds(z10s, pw_MF1613, z5); \ + tmp12 = vec_sub(tmp12, z10); \ + \ + tmp6 = vec_sub(tmp12, tmp7); \ + tmp5 = vec_sub(tmp11, tmp6); \ + tmp4 = vec_add(tmp10, tmp5); \ + \ + out0 = vec_add(tmp0, tmp7); \ + out1 = vec_add(tmp1, tmp6); \ + out2 = vec_add(tmp2, tmp5); \ + out3 = vec_sub(tmp3, tmp4); \ + out4 = vec_add(tmp3, tmp4); \ + out5 = vec_sub(tmp2, tmp5); \ + out6 = vec_sub(tmp1, tmp6); \ + out7 = vec_sub(tmp0, tmp7); \ +} + + +void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z5, z10, z10s, z11, z12s, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, + pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, + pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, + pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, + pass1_bits3 = { __8X(PASS1_BITS + 3) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row); + + out0 = vec_sra(out0, pass1_bits3); + out1 = vec_sra(out1, pass1_bits3); + out2 = vec_sra(out2, pass1_bits3); + out3 = vec_sra(out3, pass1_bits3); + out4 = vec_sra(out4, pass1_bits3); + out5 = vec_sra(out5, pass1_bits3); + out6 = vec_sra(out6, pass1_bits3); + out7 = vec_sra(out7, pass1_bits3); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/media/libjpeg/simd/powerpc/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c new file mode 100644 index 0000000000..60e619f11d --- /dev/null +++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c @@ -0,0 +1,357 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* ACCURATE INTEGER INVERSE DCT */ + +#include "jsimd_altivec.h" + + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + + +#define DO_IDCT(in, PASS) { \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + in##26l = vec_mergeh(in##2, in##6); \ + in##26h = vec_mergel(in##2, in##6); \ + \ + tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \ + tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \ + tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \ + tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \ + \ + tmp0 = vec_add(in##0, in##4); \ + tmp1 = vec_sub(in##0, in##4); \ + \ + tmp0l = vec_unpackh(tmp0); \ + tmp0h = vec_unpackl(tmp0); \ + tmp0l = vec_sl(tmp0l, const_bits); \ + tmp0h = vec_sl(tmp0h, const_bits); \ + tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \ + tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \ + \ + tmp10l = vec_add(tmp0l, tmp3l); \ + tmp10h = vec_add(tmp0h, tmp3h); \ + tmp13l = vec_sub(tmp0l, tmp3l); \ + tmp13h = vec_sub(tmp0h, tmp3h); \ + \ + tmp1l = vec_unpackh(tmp1); \ + tmp1h = vec_unpackl(tmp1); \ + tmp1l = vec_sl(tmp1l, const_bits); \ + tmp1h = vec_sl(tmp1h, const_bits); \ + tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \ + tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \ + \ + tmp11l = vec_add(tmp1l, tmp2l); \ + tmp11h = vec_add(tmp1h, tmp2h); \ + tmp12l = vec_sub(tmp1l, tmp2l); \ + tmp12h = vec_sub(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(in##3, in##7); \ + z4 = vec_add(in##1, in##5); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \ + \ + /* (Original) \ + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * tmp0 += z1 + z3; tmp1 += z2 + z4; \ + * tmp2 += z2 + z3; tmp3 += z1 + z4; \ + * \ + * (This implementation) \ + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ + * tmp0 += z3; tmp1 += z4; \ + * tmp2 += z3; tmp3 += z4; \ + */ \ + \ + in##71l = vec_mergeh(in##7, in##1); \ + in##71h = vec_mergel(in##7, in##1); \ + \ + tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \ + tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \ + tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \ + tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \ + \ + in##53l = vec_mergeh(in##5, in##3); \ + in##53h = vec_mergel(in##5, in##3); \ + \ + tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \ + tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \ + tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \ + tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \ + \ + /* Final output stage */ \ + \ + out0l = vec_add(tmp10l, tmp3l); \ + out0h = vec_add(tmp10h, tmp3h); \ + out7l = vec_sub(tmp10l, tmp3l); \ + out7h = vec_sub(tmp10h, tmp3h); \ + \ + out0l = vec_sra(out0l, descale_p##PASS); \ + out0h = vec_sra(out0h, descale_p##PASS); \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + \ + out0 = vec_pack(out0l, out0h); \ + out7 = vec_pack(out7l, out7h); \ + \ + out1l = vec_add(tmp11l, tmp2l); \ + out1h = vec_add(tmp11h, tmp2h); \ + out6l = vec_sub(tmp11l, tmp2l); \ + out6h = vec_sub(tmp11h, tmp2h); \ + \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out1 = vec_pack(out1l, out1h); \ + out6 = vec_pack(out6l, out6h); \ + \ + out2l = vec_add(tmp12l, tmp1l); \ + out2h = vec_add(tmp12h, tmp1h); \ + out5l = vec_sub(tmp12l, tmp1l); \ + out5h = vec_sub(tmp12h, tmp1h); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h, descale_p##PASS); \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out5 = vec_pack(out5l, out5h); \ + \ + out3l = vec_add(tmp13l, tmp0l); \ + out3h = vec_add(tmp13h, tmp0h); \ + out4l = vec_sub(tmp13l, tmp0l); \ + out4h = vec_sub(tmp13h, tmp0h); \ + \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + out4l = vec_sra(out4l, descale_p##PASS); \ + out4h = vec_sra(out4h, descale_p##PASS); \ + \ + out3 = vec_pack(out3l, out3h); \ + out4 = vec_pack(out4l, out4h); \ +} + + +void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, z3, z4, + z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, + row71l, row71h, row26l, row26h, row53l, row53h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, + tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, + z3l, z3h, z4l, z4h, + out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, + out5l, out5h, out6l, out6h, out7l, out7h; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_zero = { __4X(0) }, + pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }, + const_bits = { __4X(CONST_BITS) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + col0 = vec_sl(col0, pass1_bits); + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col, 1); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row, 2); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/media/libjpeg/simd/powerpc/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c new file mode 100644 index 0000000000..7d6e32542b --- /dev/null +++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c @@ -0,0 +1,250 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_altivec.h" + + +/* NOTE: The address will either be aligned or offset by 8 bytes, so we can + * always get the data we want by using a single vector load (although we may + * have to permute the result.) + */ +#if __BIG_ENDIAN__ + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_ld(0, elemptr); \ + if ((size_t)elemptr & 15) \ + in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ +} + +#else + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_vsx_ld(0, elemptr); \ +} + +#endif + + +void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + JSAMPROW elemptr; + + __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; + __vector short out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; + __vector unsigned char pb_zero = { __16X(0) }; + + LOAD_ROW(0); + LOAD_ROW(1); + LOAD_ROW(2); + LOAD_ROW(3); + LOAD_ROW(4); + LOAD_ROW(5); + LOAD_ROW(6); + LOAD_ROW(7); + + out0 = (__vector short)VEC_UNPACKHU(in0); + out1 = (__vector short)VEC_UNPACKHU(in1); + out2 = (__vector short)VEC_UNPACKHU(in2); + out3 = (__vector short)VEC_UNPACKHU(in3); + out4 = (__vector short)VEC_UNPACKHU(in4); + out5 = (__vector short)VEC_UNPACKHU(in5); + out6 = (__vector short)VEC_UNPACKHU(in6); + out7 = (__vector short)VEC_UNPACKHU(in7); + + out0 = vec_sub(out0, pw_centerjsamp); + out1 = vec_sub(out1, pw_centerjsamp); + out2 = vec_sub(out2, pw_centerjsamp); + out3 = vec_sub(out3, pw_centerjsamp); + out4 = vec_sub(out4, pw_centerjsamp); + out5 = vec_sub(out5, pw_centerjsamp); + out6 = vec_sub(out6, pw_centerjsamp); + out7 = vec_sub(out7, pw_centerjsamp); + + vec_st(out0, 0, workspace); + vec_st(out1, 16, workspace); + vec_st(out2, 32, workspace); + vec_st(out3, 48, workspace); + vec_st(out4, 64, workspace); + vec_st(out5, 80, workspace); + vec_st(out6, 96, workspace); + vec_st(out7, 112, workspace); +} + + +#define WORD_BIT 16 + +/* There is no AltiVec 16-bit unsigned multiply instruction, hence this. + We basically need an unsigned equivalent of vec_madds(). */ + +#define MULTIPLY(vs0, vs1, out) { \ + tmpe = vec_mule((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + tmpo = vec_mulo((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + out = (__vector short)vec_perm((__vector unsigned short)tmpe, \ + (__vector unsigned short)tmpo, \ + shift_pack_index); \ +} + +void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, + corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, + recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, + scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; + __vector unsigned int tmpe, tmpo; + + /* Constants */ + __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) }; +#if __BIG_ENDIAN__ + __vector unsigned char shift_pack_index = + { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 }; +#else + __vector unsigned char shift_pack_index = + { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 }; +#endif + + row0 = vec_ld(0, workspace); + row1 = vec_ld(16, workspace); + row2 = vec_ld(32, workspace); + row3 = vec_ld(48, workspace); + row4 = vec_ld(64, workspace); + row5 = vec_ld(80, workspace); + row6 = vec_ld(96, workspace); + row7 = vec_ld(112, workspace); + + /* Branch-less absolute value */ + row0s = vec_sra(row0, pw_word_bit_m1); + row1s = vec_sra(row1, pw_word_bit_m1); + row2s = vec_sra(row2, pw_word_bit_m1); + row3s = vec_sra(row3, pw_word_bit_m1); + row4s = vec_sra(row4, pw_word_bit_m1); + row5s = vec_sra(row5, pw_word_bit_m1); + row6s = vec_sra(row6, pw_word_bit_m1); + row7s = vec_sra(row7, pw_word_bit_m1); + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + + corr0 = vec_ld(DCTSIZE2 * 2, divisors); + corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors); + corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors); + corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors); + corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors); + corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors); + corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors); + corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors); + + row0 = vec_add(row0, corr0); + row1 = vec_add(row1, corr1); + row2 = vec_add(row2, corr2); + row3 = vec_add(row3, corr3); + row4 = vec_add(row4, corr4); + row5 = vec_add(row5, corr5); + row6 = vec_add(row6, corr6); + row7 = vec_add(row7, corr7); + + recip0 = vec_ld(0, divisors); + recip1 = vec_ld(16, divisors); + recip2 = vec_ld(32, divisors); + recip3 = vec_ld(48, divisors); + recip4 = vec_ld(64, divisors); + recip5 = vec_ld(80, divisors); + recip6 = vec_ld(96, divisors); + recip7 = vec_ld(112, divisors); + + MULTIPLY(row0, recip0, row0); + MULTIPLY(row1, recip1, row1); + MULTIPLY(row2, recip2, row2); + MULTIPLY(row3, recip3, row3); + MULTIPLY(row4, recip4, row4); + MULTIPLY(row5, recip5, row5); + MULTIPLY(row6, recip6, row6); + MULTIPLY(row7, recip7, row7); + + scale0 = vec_ld(DCTSIZE2 * 4, divisors); + scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors); + scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors); + scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors); + scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors); + scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors); + scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors); + scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors); + + MULTIPLY(row0, scale0, row0); + MULTIPLY(row1, scale1, row1); + MULTIPLY(row2, scale2, row2); + MULTIPLY(row3, scale3, row3); + MULTIPLY(row4, scale4, row4); + MULTIPLY(row5, scale5, row5); + MULTIPLY(row6, scale6, row6); + MULTIPLY(row7, scale7, row7); + + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + + vec_st(row0, 0, coef_block); + vec_st(row1, 16, coef_block); + vec_st(row2, 32, coef_block); + vec_st(row3, 48, coef_block); + vec_st(row4, 64, coef_block); + vec_st(row5, 80, coef_block); + vec_st(row6, 96, coef_block); + vec_st(row7, 112, coef_block); +} diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c new file mode 100644 index 0000000000..d0d3981e06 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jsimd.c @@ -0,0 +1,872 @@ +/* + * jsimd_powerpc.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * PowerPC architecture. + */ + +#ifdef __amigaos4__ +/* This must be defined first as it re-defines GLOBAL otherwise */ +#include <proto/exec.h> +#endif + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +#if defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> +#endif + +static unsigned int simd_support = ~0; + +#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature(char *buffer, char *feature) +{ + char *p; + + if (*feature == 0) + return 0; + if (strncmp(buffer, "cpu", 3) != 0) + return 0; + buffer += 3; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo(int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "altivec")) + simd_support |= JSIMD_ALTIVEC; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif +#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#elif defined(__amigaos4__) + uint32 altivec = 0; +#elif defined(__OpenBSD__) + int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; + int altivec; + size_t len = sizeof(altivec); +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__ALTIVEC__) || defined(__APPLE__) + simd_support |= JSIMD_ALTIVEC; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#elif defined(__amigaos4__) + IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE); + if (altivec == VECTORTYPE_ALTIVEC) + simd_support |= JSIMD_ALTIVEC; +#elif defined(__OpenBSD__) + if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0) + simd_support |= JSIMD_ALTIVEC; +#endif + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEALTIVEC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ALTIVEC; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_extrgb_ycc_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_extrgbx_ycc_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_extbgr_ycc_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_extbgrx_ycc_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_extxbgr_ycc_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_extxrgb_ycc_convert_altivec; + break; + default: + altivecfct = jsimd_rgb_ycc_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_extrgb_gray_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_extrgbx_gray_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_extbgr_gray_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_extbgrx_gray_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_extxbgr_gray_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_extxrgb_gray_convert_altivec; + break; + default: + altivecfct = jsimd_rgb_gray_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_ycc_extrgb_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_ycc_extrgbx_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_ycc_extbgr_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_ycc_extbgrx_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_ycc_extxbgr_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_ycc_extxrgb_convert_altivec; + break; + default: + altivecfct = jsimd_ycc_rgb_convert_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct = jsimd_h2v2_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct = jsimd_h2v1_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_altivec(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + jsimd_fdct_islow_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + jsimd_quantize_altivec(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return 0; +} diff --git a/media/libjpeg/simd/powerpc/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h new file mode 100644 index 0000000000..e8bdb06a54 --- /dev/null +++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h @@ -0,0 +1,98 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include <altivec.h> + + +/* Common code */ + +#define __4X(a) a, a, a, a +#define __4X2(a, b) a, b, a, b, a, b, a, b +#define __8X(a) __4X(a), __4X(a) +#define __16X(a) __8X(a), __8X(a) + +#define TRANSPOSE(row, col) { \ + __vector short row04l, row04h, row15l, row15h, \ + row26l, row26h, row37l, row37h; \ + __vector short col01e, col01o, col23e, col23o, \ + col45e, col45o, col67e, col67o; \ + \ + /* transpose coefficients (phase 1) */ \ + row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ + row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ + row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ + row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ + row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ + row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ + row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ + row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ + \ + /* transpose coefficients (phase 2) */ \ + col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ + col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ + col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ + col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ + col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ + col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ + col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ + col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ + \ + /* transpose coefficients (phase 3) */ \ + col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ + col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ + col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ + col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ + col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ + col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ + col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ + col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ +} + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + + +/* Macros to abstract big/little endian bit twiddling */ + +#if __BIG_ENDIAN__ + +#define VEC_LD(a, b) vec_ld(a, b) +#define VEC_ST(a, b, c) vec_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) +#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) + +#else + +#define VEC_LD(a, b) vec_vsx_ld(a, b) +#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) +#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) + +#endif diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm new file mode 100644 index 0000000000..10d28348a9 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm @@ -0,0 +1,558 @@ +; +; jccolext.asm - colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 8 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2) + +EXTN(jsimd_rgb_ycc_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdx + push rbx + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 + + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + vmovd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub rcx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [rsi+rcx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub rcx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub rcx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_ycc_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD] + +.rgb_ycc_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE + vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO + vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE + vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vmovdqa ymm7, ymm1 + vmovdqa ymm4, ymm6 + vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + vpxor ymm1, ymm1, ymm1 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL + vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH + vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500) + + vmovdqa ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm7, ymm7, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH + vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO + + vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vmovdqa ymm5, ymm0 + vmovdqa ymm4, ymm6 + vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + vpxor ymm0, ymm0, ymm0 + vpxor ymm6, ymm6, ymm6 + vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL + vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH + vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500) + vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500) + + vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm5, ymm5, ymm0 + vpaddd ymm4, ymm4, ymm6 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm4, ymm4, ymm1 + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH + vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm5, ymm5, ymm7 ; ymm5=Cb + vmovdqu YMMWORD [rbx], ymm5 ; Save Cb + + vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO + vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE + vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vmovdqa ymm7, ymm0 + vmovdqa ymm5, ymm4 + vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + vpmaddwd ymm7, ymm7, [rel PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, YMMWORD [wk(4)] + vpaddd ymm4, ymm4, YMMWORD [wk(5)] + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vpxor ymm3, ymm3, ymm3 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL + vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH + vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500) + + vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ] + + vpaddd ymm7, ymm7, ymm3 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm7, ymm7, ymm1 + vpaddd ymm5, ymm5, ymm1 + vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH + vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO + + vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vmovdqa ymm1, ymm6 + vmovdqa ymm5, ymm4 + vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(6)] + vpaddd ymm4, ymm4, YMMWORD [wk(7)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [rdi], ymm6 ; Save Y + + vpxor ymm2, ymm2, ymm2 + vpxor ymm4, ymm4, ymm4 + vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL + vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH + vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500) + vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500) + + vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ] + + vpaddd ymm1, ymm1, ymm2 + vpaddd ymm5, ymm5, ymm4 + vpaddd ymm1, ymm1, ymm0 + vpaddd ymm5, ymm5, ymm0 + vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL + vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH + vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE + + vpsllw ymm7, ymm7, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Cr + vmovdqu YMMWORD [rdx], ymm1 ; Save Cr + + sub rcx, byte SIZEOF_YMMWORD + add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add rdi, byte SIZEOF_YMMWORD ; outptr0 + add rbx, byte SIZEOF_YMMWORD ; outptr1 + add rdx, byte SIZEOF_YMMWORD ; outptr2 + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm new file mode 100644 index 0000000000..2c914d3183 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm @@ -0,0 +1,483 @@ +; +; jccolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdx + push rbx + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + movd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + movdqa xmm7, xmm1 + movdqa xmm4, xmm6 + pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1, xmm1 + pxor xmm6, xmm6 + punpcklwd xmm1, xmm5 ; xmm1=BOL + punpckhwd xmm6, xmm5 ; xmm6=BOH + psrld xmm1, 1 ; xmm1=BOL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm1 + paddd xmm4, xmm6 + paddd xmm7, xmm5 + paddd xmm4, xmm5 + psrld xmm7, SCALEBITS ; xmm7=CbOL + psrld xmm4, SCALEBITS ; xmm4=CbOH + packssdw xmm7, xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + movdqa xmm5, xmm0 + movdqa xmm4, xmm6 + pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0, xmm0 + pxor xmm6, xmm6 + punpcklwd xmm0, xmm1 ; xmm0=BEL + punpckhwd xmm6, xmm1 ; xmm6=BEH + psrld xmm0, 1 ; xmm0=BEL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5, xmm0 + paddd xmm4, xmm6 + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrld xmm5, SCALEBITS ; xmm5=CbEL + psrld xmm4, SCALEBITS ; xmm4=CbEH + packssdw xmm5, xmm4 ; xmm5=CbE + + psllw xmm7, BYTE_BIT + por xmm5, xmm7 ; xmm5=Cb + movdqa XMMWORD [rbx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + movdqa xmm7, xmm0 + movdqa xmm5, xmm4 + pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + pxor xmm3, xmm3 + pxor xmm4, xmm4 + punpcklwd xmm3, xmm1 ; xmm3=ROL + punpckhwd xmm4, xmm1 ; xmm4=ROH + psrld xmm3, 1 ; xmm3=ROL*FIX(0.500) + psrld xmm4, 1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm3 + paddd xmm5, xmm4 + paddd xmm7, xmm1 + paddd xmm5, xmm1 + psrld xmm7, SCALEBITS ; xmm7=CrOL + psrld xmm5, SCALEBITS ; xmm5=CrOH + packssdw xmm7, xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + pxor xmm2, xmm2 + pxor xmm4, xmm4 + punpcklwd xmm2, xmm3 ; xmm2=REL + punpckhwd xmm4, xmm3 ; xmm4=REH + psrld xmm2, 1 ; xmm2=REL*FIX(0.500) + psrld xmm4, 1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1, xmm2 + paddd xmm5, xmm4 + paddd xmm1, xmm0 + paddd xmm5, xmm0 + psrld xmm1, SCALEBITS ; xmm1=CrEL + psrld xmm5, SCALEBITS ; xmm5=CrEH + packssdw xmm1, xmm5 ; xmm1=CrE + + psllw xmm7, BYTE_BIT + por xmm1, xmm7 ; xmm1=Cr + movdqa XMMWORD [rdx], xmm1 ; Save Cr + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + add rbx, byte SIZEOF_XMMWORD ; outptr1 + add rdx, byte SIZEOF_XMMWORD ; outptr2 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm new file mode 100644 index 0000000000..16b78298dc --- /dev/null +++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) + +EXTN(jconst_rgb_ycc_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2 +%include "jccolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2 +%include "jccolext-avx2.asm" diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm new file mode 100644 index 0000000000..e2955c2134 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm @@ -0,0 +1,120 @@ +; +; jccolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm new file mode 100644 index 0000000000..591255bb11 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm @@ -0,0 +1,113 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_avx2) + +EXTN(jconst_rgb_gray_convert_avx2): + +PW_F0299_F0337 times 8 dw F_0_299, F_0_337 +PW_F0114_F0250 times 8 dw F_0_114, F_0_250 +PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2 +%include "jcgryext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2 +%include "jcgryext-avx2.asm" diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm new file mode 100644 index 0000000000..e389904f2f --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm @@ -0,0 +1,112 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm new file mode 100644 index 0000000000..175b60de61 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm @@ -0,0 +1,437 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2) + +EXTN(jsimd_rgb_gray_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + vmovd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + vmovd xmmF, XMM_DWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_DWORD + vpor xmmA, xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + vmovq xmmB, XMM_MMWORD [rsi+rcx] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + sub rcx, byte SIZEOF_XMMWORD + vmovdqu xmmB, XMM_MMWORD [rsi+rcx] + vperm2i128 ymmA, ymmA, ymmA, 1 + vpor ymmA, ymmB +.column_ld32: + test cl, SIZEOF_YMMWORD + jz short .column_ld64 + sub rcx, byte SIZEOF_YMMWORD + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] +.column_ld64: + test cl, 2*SIZEOF_YMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmB, ymmA + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vmovdqu ymmC, ymmA + vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + + vmovdqa ymmG, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12 + ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I) + vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I + ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A + ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q) + vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27 + ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N) + + vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D + ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T) + vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F + ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V) + + vmovdqa ymmD, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09 + ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P) + vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P + ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T) + vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B + ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R) + + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E + ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F + ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V) + + vmovdqa ymmE, ymmA + vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C + ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S) + vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S + ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --) + + vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D + ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T) + + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F + ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F + ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V) + + vpxor ymmH, ymmH, ymmH + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmB, ymmE + vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + + vmovdqa ymmF, ymmD + vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + vpslldq xmmA, xmmA, SIZEOF_MMWORD + vpor xmmA, xmmA, xmmF +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + vmovdqa xmmF, xmmA + vperm2i128 ymmF, ymmF, ymmF, 1 + vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + vpor ymmA, ymmA, ymmF +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + jz short .column_ld16 + sub rcx, byte SIZEOF_XMMWORD/2 + vmovdqa ymmF, ymmA + vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld16: + test cl, SIZEOF_XMMWORD + mov rcx, SIZEOF_YMMWORD + jz short .rgb_gray_cnv + vmovdqa ymmE, ymmA + vmovdqa ymmH, ymmF + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD] + vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD] + +.rgb_gray_cnv: + ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmB, ymmA + vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + + vmovdqa ymmB, ymmF + vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + vmovdqa ymmD, ymmA + vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35 + ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L) + vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37 + ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N) + + vmovdqa ymmC, ymmF + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D + ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T) + vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F + ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V) + + vmovdqa ymmB, ymmA + vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C + ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S) + vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D + ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T) + + vmovdqa ymmG, ymmD + vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E + ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U) + vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F + ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V) + + vmovdqa ymmE, ymmA + vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E + ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U) + vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E + ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U) + + vmovdqa ymmH, ymmB + vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F + ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V) + vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F + ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V) + + vpxor ymmF, ymmF, ymmF + + vmovdqa ymmC, ymmA + vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U) + vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U) + + vmovdqa ymmD, ymmB + vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V) + vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V) + + vmovdqa ymmG, ymmE + vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U) + vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U) + + vpunpcklbw ymmF, ymmF, ymmH + vpunpckhbw ymmH, ymmH, ymmH + vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V) + vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE + ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + vmovdqa ymm6, ymm1 + vpunpcklwd ymm1, ymm1, ymm3 + vpunpckhwd ymm6, ymm6, ymm3 + vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + vmovdqa ymm6, ymm0 + vpunpcklwd ymm0, ymm0, ymm2 + vpunpckhwd ymm6, ymm6, ymm2 + vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337) + vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + vmovdqa ymm0, ymm5 ; ymm0=BO + vmovdqa ymm6, ymm4 ; ymm6=BE + + vmovdqa ymm4, ymm0 + vpunpcklwd ymm0, ymm0, ymm3 + vpunpckhwd ymm4, ymm4, ymm3 + vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF] + + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm4, ymm4, ymm7 + vpaddd ymm0, ymm0, ymm3 + vpaddd ymm4, ymm4, ymm3 + vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH + vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO + + vmovdqa ymm4, ymm6 + vpunpcklwd ymm6, ymm6, ymm2 + vpunpckhwd ymm4, ymm4, ymm2 + vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250) + vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF] + + vpaddd ymm6, ymm6, YMMWORD [wk(0)] + vpaddd ymm4, ymm4, YMMWORD [wk(1)] + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm4, ymm4, ymm2 + vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL + vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH + vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE + + vpsllw ymm0, ymm0, BYTE_BIT + vpor ymm6, ymm6, ymm0 ; ymm6=Y + vmovdqu YMMWORD [rdi], ymm6 ; Save Y + + sub rcx, byte SIZEOF_YMMWORD + add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr + add rdi, byte SIZEOF_YMMWORD ; outptr0 + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm new file mode 100644 index 0000000000..873be80564 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm @@ -0,0 +1,362 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf, +; JSAMPIMAGE output_buf, JDIMENSION output_row, +; int num_rows); +; + +; r10d = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13d = JDIMENSION output_row +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, byte [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, word [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx +.column_ld4: + movd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH, xmmH + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm new file mode 100644 index 0000000000..7deab582b9 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm @@ -0,0 +1,345 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) +; +; Copyright (C) 2009-2011, 2014-2016, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_huff_encode_one_block) + EXTERN EXTN(jpeg_nbits_table) + +EXTN(jconst_huff_encode_one_block): + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov rdx, put_buffer + mov ecx, put_bits + shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [buffer], dl ; *buffer++ = c; + add buffer, 1 + cmp dl, 0xFF ; need to stuff a zero byte? + jne %%.EMIT_BYTE_END + mov byte [buffer], 0 ; *buffer++ = 0; + add buffer, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF31 0 + cmp put_bits, 32 ; if (put_bits > 31) { + jl %%.CHECKBUF31_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF31_END: +%endmacro + +%macro CHECKBUF47 0 + cmp put_bits, 48 ; if (put_bits > 47) { + jl %%.CHECKBUF47_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF47_END: +%endmacro + +%macro EMIT_BITS 2 + CHECKBUF47 + mov ecx, %2 + PUT_BITS %1 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); + pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); + pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); + pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. +; +; GLOBAL(JOCTET *) +; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; r10 = working_state *state +; r11 = JOCTET *buffer +; r12 = JCOEFPTR block +; r13d = int last_dc_val +; r14 = c_derived_tbl *dctbl +; r15 = c_derived_tbl *actbl + +%define t1 rbp - (DCTSIZE2 * SIZEOF_WORD) +%define t2 t1 - (DCTSIZE2 * SIZEOF_WORD) +%define put_buffer r8 +%define put_bits r9d +%define buffer rax + + align 32 + GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [t2] + push_xmm 4 + collect_args 6 + push rbx + + mov buffer, r11 ; r11 is now sratch + + mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; + mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits; + push r10 ; r10 is now scratch + + ; Encode the DC coefficient difference per section F.1.2.1 + movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; + sub edi, r13d ; r13 is not used anymore + mov ebx, edi + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov esi, edi + sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor edi, esi ; temp ^= temp3; + sub edi, esi ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add ebx, esi ; temp2 += temp3; + + ; Find the number of bits needed for the magnitude of the coefficient + lea r11, [rel EXTN(jpeg_nbits_table)] + movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); + ; Emit the Huffman-coded symbol for the number of bits + mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; + movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS r11, esi ; EMIT_BITS(code, size) + + ; Mask off any extra bits in code + mov esi, 1 + mov ecx, edi + shl esi, cl + dec esi + and ebx, esi ; temp2 &= (((JLONG)1)<<nbits) - 1; + + ; Emit that number of bits of the value, if positive, + ; or the complement of its magnitude, if negative. + EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits) + + ; Prepare data + xor ebx, ebx + kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \ + 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \ + 27, 20, 13, 6, 7, 14, 21, 28, 35, \ + xmm0, xmm1, xmm2, xmm3 + kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \ + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \ + 53, 60, 61, 54, 47, 55, 62, 63, 63, \ + xmm4, xmm5, xmm6, xmm7 + + pxor xmm8, xmm8 + pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero); + pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero); + pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero); + pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5); + packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7); + pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32; + pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48; + shl r12, 16 + shl r14, 16 + or r11, r12 + or r13, r14 + shl r13, 32 + or r11, r13 + not r11 ; index = ~index; + + ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11 + ;jmp .EFN + + mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + lea rsi, [t1] +.BLOOP: + bsf r12, r11 ; r = __builtin_ctzl(index); + jz .ELOOP + mov rcx, r12 + lea rsi, [rsi+r12*2] ; k += r; + shr r11, cl ; index >>= r; + movzx rdi, word [rsi] ; temp = t1[k]; + lea rbx, [rel EXTN(jpeg_nbits_table)] + movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); +.BRLOOP: + cmp r12, 16 ; while (r > 15) { + jl .ERLOOP + EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) + sub r12, 16 ; r -= 16; + jmp .BRLOOP +.ERLOOP: + ; Emit Huffman symbol for run length / number of bits + CHECKBUF31 ; uses rcx, rdx + + shl r12, 4 ; temp3 = (r << 4) + nbits; + add r12, rdi + mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; + PUT_BITS rbx + + ;EMIT_CODE(code, size) + + movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov rcx, rdi + mov rdx, 1 + shl rdx, cl + dec rdx + and rbx, rdx ; temp2 &= (((JLONG)1)<<nbits) - 1; + PUT_BITS rbx ; PUT_BITS(temp2, nbits) + + shr r11, 1 ; index >>= 1; + add rsi, 2 ; ++k; + jmp .BLOOP +.ELOOP: + ; If the last coef(s) were zero, emit an end-of-block code + lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp rdi, rsi ; if (r > 0) { + je .EFN + mov ebx, INT [r15] ; code = actbl->ehufco[0]; + movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS rbx, r12d +.EFN: + pop r10 + ; Save put_buffer & put_bits + mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; + mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits; + + pop rbx + uncollect_args 6 + pop_xmm 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm new file mode 100644 index 0000000000..8ed44728fe --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm @@ -0,0 +1,637 @@ +; +; jcphuff-sse2.asm - prepare data for progressive Huffman encoding +; (64-bit SSE2) +; +; Copyright (C) 2016, 2018, Matthieu Darbois +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation of data preparation for progressive +; Huffman encoding. See jcphuff.c for more details. + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; -------------------------------------------------------------------------- +; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and +; jsimd_encode_mcu_AC_refine_prepare_sse2() + +%macro LOAD16 0 + pxor N0, N0 + pxor N1, N1 + + mov T0d, INT [LUT + 0*SIZEOF_INT] + mov T1d, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + mov T1d, INT [LUT + 9*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + pinsrw X1, word [BLOCK + T1 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + mov T1d, INT [LUT + 10*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + pinsrw X1, word [BLOCK + T1 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + mov T1d, INT [LUT + 11*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + pinsrw X1, word [BLOCK + T1 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + mov T1d, INT [LUT + 12*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + pinsrw X1, word [BLOCK + T1 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + mov T1d, INT [LUT + 13*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + pinsrw X1, word [BLOCK + T1 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + mov T1d, INT [LUT + 14*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + pinsrw X1, word [BLOCK + T1 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + mov T1d, INT [LUT + 15*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + pinsrw X1, word [BLOCK + T1 * 2], 7 +%endmacro + +%macro LOAD15 0 + pxor N0, N0 + pxor N1, N1 + pxor X1, X1 + + mov T0d, INT [LUT + 0*SIZEOF_INT] + mov T1d, INT [LUT + 8*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + pinsrw X1, word [BLOCK + T1 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 + + cmp LENEND, 2 + jl %%.ELOAD15 + mov T1d, INT [LUT + 9*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD15 + mov T1d, INT [LUT + 10*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD15 + mov T1d, INT [LUT + 11*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD15 + mov T1d, INT [LUT + 12*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD15 + mov T1d, INT [LUT + 13*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD15 + mov T1d, INT [LUT + 14*SIZEOF_INT] + pinsrw X1, word [BLOCK + T1 * 2], 6 +%%.ELOAD15: +%endmacro + +%macro LOAD8 0 + pxor N0, N0 + + mov T0d, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 0 + + mov T0d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 1 + + mov T0d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 2 + + mov T0d, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 3 + + mov T0d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 4 + + mov T0d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 5 + + mov T0d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 6 + + mov T0d, INT [LUT + 7*SIZEOF_INT] + pinsrw X0, word [BLOCK + T0 * 2], 7 +%endmacro + +%macro LOAD7 0 + pxor N0, N0 + pxor X0, X0 + + mov T1d, INT [LUT + 0*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 0 + + cmp LENEND, 2 + jl %%.ELOAD7 + mov T1d, INT [LUT + 1*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 1 + + cmp LENEND, 3 + jl %%.ELOAD7 + mov T1d, INT [LUT + 2*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 2 + + cmp LENEND, 4 + jl %%.ELOAD7 + mov T1d, INT [LUT + 3*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 3 + + cmp LENEND, 5 + jl %%.ELOAD7 + mov T1d, INT [LUT + 4*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 4 + + cmp LENEND, 6 + jl %%.ELOAD7 + mov T1d, INT [LUT + 5*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 5 + + cmp LENEND, 7 + jl %%.ELOAD7 + mov T1d, INT [LUT + 6*SIZEOF_INT] + pinsrw X0, word [BLOCK + T1 * 2], 6 +%%.ELOAD7: +%endmacro + +%macro REDUCE0 0 + movdqa xmm0, XMMWORD [VALUES + ( 0*2)] + movdqa xmm1, XMMWORD [VALUES + ( 8*2)] + movdqa xmm2, XMMWORD [VALUES + (16*2)] + movdqa xmm3, XMMWORD [VALUES + (24*2)] + movdqa xmm4, XMMWORD [VALUES + (32*2)] + movdqa xmm5, XMMWORD [VALUES + (40*2)] + movdqa xmm6, XMMWORD [VALUES + (48*2)] + movdqa xmm7, XMMWORD [VALUES + (56*2)] + + pcmpeqw xmm0, ZERO + pcmpeqw xmm1, ZERO + pcmpeqw xmm2, ZERO + pcmpeqw xmm3, ZERO + pcmpeqw xmm4, ZERO + pcmpeqw xmm5, ZERO + pcmpeqw xmm6, ZERO + pcmpeqw xmm7, ZERO + + packsswb xmm0, xmm1 + packsswb xmm2, xmm3 + packsswb xmm4, xmm5 + packsswb xmm6, xmm7 + + pmovmskb eax, xmm0 + pmovmskb ecx, xmm2 + pmovmskb edx, xmm4 + pmovmskb esi, xmm6 + + shl rcx, 16 + shl rdx, 32 + shl rsi, 48 + + or rax, rcx + or rdx, rsi + or rax, rdx + + not rax + + mov MMWORD [r15], rax +%endmacro + +; +; Prepare data for jsimd_encode_mcu_AC_first(). +; +; GLOBAL(void) +; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *values, +; size_t *zerobits) +; +; r10 = const JCOEF *block +; r11 = const int *jpeg_natural_order_start +; r12 = int Sl +; r13 = int Al +; r14 = JCOEF *values +; r15 = size_t *zerobits + +%define ZERO xmm9 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define LUT r11 +%define T0 rcx +%define T0d ecx +%define T1 rdx +%define T1d edx +%define BLOCK r10 +%define VALUES r14 +%define LEN r12d +%define LENEND r13d + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [rbp - 16] + collect_args 6 + + movdqa XMMWORD [rbp - 16], ZERO + + movd AL, r13d + pxor ZERO, ZERO + mov K, LEN + mov LENEND, LEN + and K, -16 + and LENEND, 7 + shr K, 4 + jz .ELOOP16 +.BLOOP16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + dec K + jnz .BLOOP16 + test LEN, 15 + je .PADDING +.ELOOP16: + test LEN, 8 + jz .TRY7 + test LEN, 7 + jz .TRY8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + pxor N0, X0 + pxor N1, X1 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 + add VALUES, 16*2 + jmp .PADDING +.TRY8: + LOAD8 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 + jmp .PADDING +.TRY7: + LOAD7 + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + pxor N0, X0 + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 + add VALUES, 8*2 +.PADDING: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDING + align 16 +.ZEROLOOP: + movdqa XMMWORD [VALUES + 0], ZERO + add VALUES, 8*2 + inc K + jnz .ZEROLOOP +.EPADDING: + sub VALUES, DCTSIZE2*2 + + REDUCE0 + + movdqa ZERO, XMMWORD [rbp - 16] + uncollect_args 6 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +%undef ZERO +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef LUT +%undef T0 +%undef T0d +%undef T1 +%undef T1d +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; +; Prepare data for jsimd_encode_mcu_AC_refine(). +; +; GLOBAL(int) +; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, +; const int *jpeg_natural_order_start, +; int Sl, int Al, JCOEF *absvalues, +; size_t *bits) +; +; r10 = const JCOEF *block +; r11 = const int *jpeg_natural_order_start +; r12 = int Sl +; r13 = int Al +; r14 = JCOEF *values +; r15 = size_t *bits + +%define ZERO xmm9 +%define ONE xmm5 +%define X0 xmm0 +%define X1 xmm1 +%define N0 xmm2 +%define N1 xmm3 +%define AL xmm4 +%define K eax +%define KK r9d +%define EOB r8d +%define SIGN rdi +%define LUT r11 +%define T0 rcx +%define T0d ecx +%define T1 rdx +%define T1d edx +%define BLOCK r10 +%define VALUES r14 +%define LEN r12d +%define LENEND r13d + + align 32 + GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) + +EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [rbp - 16] + collect_args 6 + + movdqa XMMWORD [rbp - 16], ZERO + + xor SIGN, SIGN + xor EOB, EOB + xor KK, KK + movd AL, r13d + pxor ZERO, ZERO + pcmpeqw ONE, ONE + psrlw ONE, 15 + mov K, LEN + mov LENEND, LEN + and K, -16 + and LENEND, 7 + shr K, 4 + jz .ELOOPR16 +.BLOOPR16: + LOAD16 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 16 ; make room for sizebits + shl T0, 48 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER16 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER16: + add VALUES, 16*2 + add LUT, 16*SIZEOF_INT + add KK, 16 + dec K + jnz .BLOOPR16 +.ELOOPR16: + test LEN, 8 + jz .TRYR7 + test LEN, 7 + jz .TRYR8 + + LOAD15 + pcmpgtw N0, X0 + pcmpgtw N1, X1 + paddw X0, N0 + paddw X1, N1 + pxor X0, N0 + pxor X1, N1 + psrlw X0, AL + psrlw X1, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + movdqa XMMWORD [VALUES + (8) * 2], X1 + pcmpeqw X0, ONE + pcmpeqw X1, ONE + packsswb N0, N1 + packsswb X0, X1 + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 16 ; make room for sizebits + shl T0, 48 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER15 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER15: + add VALUES, 16*2 + jmp .PADDINGR +.TRYR8: + LOAD8 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 8 ; make room for sizebits + shl T0, 56 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER8 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER8: + add VALUES, 8*2 + jmp .PADDINGR +.TRYR7: + LOAD7 + + pcmpgtw N0, X0 + paddw X0, N0 + pxor X0, N0 + psrlw X0, AL + movdqa XMMWORD [VALUES + (0) * 2], X0 + pcmpeqw X0, ONE + packsswb N0, ZERO + packsswb X0, ZERO + pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); + pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1); + shr SIGN, 8 ; make room for sizebits + shl T0, 56 + or SIGN, T0 + bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1); + jz .CONTINUER7 ; if (idx) { + mov EOB, KK + add EOB, T1d ; EOB = k + idx; +.CONTINUER7: + add VALUES, 8*2 +.PADDINGR: + mov K, LEN + add K, 7 + and K, -8 + shr K, 3 + sub K, DCTSIZE2/8 + jz .EPADDINGR + align 16 +.ZEROLOOPR: + movdqa XMMWORD [VALUES + 0], ZERO + shr SIGN, 8 + add VALUES, 8*2 + inc K + jnz .ZEROLOOPR +.EPADDINGR: + not SIGN + sub VALUES, DCTSIZE2*2 + mov MMWORD [r15+SIZEOF_MMWORD], SIGN + + REDUCE0 + + mov eax, EOB + movdqa ZERO, XMMWORD [rbp - 16] + uncollect_args 6 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +%undef ZERO +%undef ONE +%undef X0 +%undef X1 +%undef N0 +%undef N1 +%undef AL +%undef K +%undef KK +%undef EOB +%undef SIGN +%undef LUT +%undef T0 +%undef T0d +%undef T1 +%undef T1d +%undef BLOCK +%undef VALUES +%undef LEN +%undef LENEND + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm new file mode 100644 index 0000000000..d9922bb4cb --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm @@ -0,0 +1,366 @@ +; +; jcsample.asm - downsampling (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2) + +EXTN(jsimd_h2v1_downsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax, eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + vmovd xmm7, edx + vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + +.columnloop_r24: + ; rcx can possibly be 8, 16, 24 + cmp rcx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD] + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp rcx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD] + vpxor ymm1, ymm1, ymm1 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD] + +.downsample: + vpsrlw ymm2, ymm0, BYTE_BIT + vpand ymm0, ymm0, ymm6 + vpsrlw ymm3, ymm1, BYTE_BIT + vpand ymm1, ymm1, ymm6 + + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpsrlw ymm0, ymm0, 1 + vpsrlw ymm1, ymm1, 1 + + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + + sub rcx, byte SIZEOF_YMMWORD ; outcol + add rsi, byte 2*SIZEOF_YMMWORD ; inptr + add rdi, byte 1*SIZEOF_YMMWORD ; outptr + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + test rcx, rcx + jnz near .columnloop_r24 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 6 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2) + +EXTN(jsimd_h2v2_downsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v2_downsample + + mov eax, r12d ; rowctr + test rax, rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + vmovd xmm7, edx + vpcmpeqw ymm6, ymm6, ymm6 + vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2} + vperm2i128 ymm7, ymm7, ymm7, 0 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_YMMWORD + jae short .columnloop + +.columnloop_r24: + cmp rcx, 24 + jne .columnloop_r16 + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD] + vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD] + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r16: + cmp rcx, 16 + jne .columnloop_r8 + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop_r8: + vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov rcx, SIZEOF_YMMWORD + jmp short .downsample + +.columnloop: + vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] + vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD] + vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD] + +.downsample: + vpand ymm4, ymm0, ymm6 + vpsrlw ymm0, ymm0, BYTE_BIT + vpand ymm5, ymm1, ymm6 + vpsrlw ymm1, ymm1, BYTE_BIT + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + + vpand ymm4, ymm2, ymm6 + vpsrlw ymm2, ymm2, BYTE_BIT + vpand ymm5, ymm3, ymm6 + vpsrlw ymm3, ymm3, BYTE_BIT + vpaddw ymm2, ymm2, ymm4 + vpaddw ymm3, ymm3, ymm5 + + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpsrlw ymm0, ymm0, 2 + vpsrlw ymm2, ymm2, 2 + + vpackuswb ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + + sub rcx, byte SIZEOF_YMMWORD ; outcol + add rdx, byte 2*SIZEOF_YMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_YMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_YMMWORD ; outptr + cmp rcx, byte SIZEOF_YMMWORD + jae near .columnloop + test rcx, rcx + jnz near .columnloop_r24 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 6 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm new file mode 100644 index 0000000000..0f107e9a07 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm @@ -0,0 +1,329 @@ +; +; jcsample.asm - downsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax, eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + test rcx, rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args 6 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, +; JDIMENSION width_in_blocks, JSAMPARRAY input_data, +; JSAMPARRAY output_data); +; + +; r10d = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_in_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 6 + + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end + + mov rax, r11 + test rax, rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v2_downsample + + mov eax, r12d ; rowctr + test rax, rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args 6 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm new file mode 100644 index 0000000000..677b8ed84e --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm @@ -0,0 +1,495 @@ +; +; jdcolext.asm - colorspace conversion (64-bit AVX2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +; r10d = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2) + +EXTN(jsimd_ycc_rgb_convert_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d ; num_cols + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr +.columnloop: + + vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm0, ymm0, ymm0 + vpcmpeqw ymm7, ymm7, ymm7 + vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..} + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO + vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE + vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO + + vpaddw ymm2, ymm4, ymm7 + vpaddw ymm3, ymm5, ymm7 + vpaddw ymm6, ymm0, ymm7 + vpaddw ymm7, ymm1, ymm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE + vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO + vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE + vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO + + vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800)) + vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800)) + vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200)) + vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, [rel PW_ONE] + vpaddw ymm5, ymm5, [rel PW_ONE] + vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800)) + vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800)) + vpaddw ymm0, ymm0, [rel PW_ONE] + vpaddw ymm1, ymm1, [rel PW_ONE] + vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200)) + vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200)) + + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm5, ymm5, ymm3 + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E + vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O + vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E + vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O + + vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E + vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O + + vpunpckhwd ymm4, ymm2, ymm6 + vpunpcklwd ymm2, ymm2, ymm6 + vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285] + vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285] + vpunpckhwd ymm5, ymm3, ymm7 + vpunpcklwd ymm3, ymm3, ymm7 + vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285] + vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285] + + vpaddd ymm2, ymm2, [rel PD_ONEHALF] + vpaddd ymm4, ymm4, [rel PD_ONEHALF] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm4, ymm4, SCALEBITS + vpaddd ymm3, ymm3, [rel PD_ONEHALF] + vpaddd ymm5, ymm5, [rel PD_ONEHALF] + vpsrad ymm3, ymm3, SCALEBITS + vpsrad ymm5, ymm5, SCALEBITS + + vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm4, ymm4, ymm4 + vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..} + vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO + + vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub rcx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH +.out0: + add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + cmp rcx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub rcx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp rcx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + vperm2i128 ymmA, ymmA, ymmA, 1 + add rdi, byte SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_YMMWORD/16*4 + sub rcx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + vmovd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + vzeroupper + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm new file mode 100644 index 0000000000..071aa62913 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm @@ -0,0 +1,438 @@ +; +; jdcolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf, +; JDIMENSION input_row, JSAMPARRAY output_buf, +; int num_rows) +; + +; r10d = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14d = int num_rows + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 5 + push rbx + + mov ecx, r10d ; num_cols + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax, rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr +.columnloop: + + movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [rel PW_ONE] + paddw xmm5, [rel PW_ONE] + psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [rel PW_ONE] + paddw xmm1, [rel PW_ONE] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm4, [rel PW_MF0344_F0285] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [rel PW_MF0344_F0285] + pmaddwd xmm5, [rel PW_MF0344_F0285] + + paddd xmm2, [rel PD_ONEHALF] + paddd xmm4, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [rel PD_ONEHALF] + paddd xmm5, [rel PD_ONEHALF] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args 5 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm new file mode 100644 index 0000000000..43de9db04d --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm @@ -0,0 +1,118 @@ +; +; jdcolor.asm - colorspace conversion (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) + +EXTN(jconst_ycc_rgb_convert_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2 +%include "jdcolext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2 +%include "jdcolext-avx2.asm" diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm new file mode 100644 index 0000000000..b3f1fec07e --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm @@ -0,0 +1,117 @@ +; +; jdcolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2.asm" diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm new file mode 100644 index 0000000000..9515a17013 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm @@ -0,0 +1,136 @@ +; +; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_avx2) + +EXTN(jconst_merged_upsample_avx2): + +PW_F0402 times 16 dw F_0_402 +PW_MF0228 times 16 dw -F_0_228 +PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 +PW_ONE times 16 dw 1 +PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extrgbx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extrgbx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extbgrx_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extbgrx_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxbgr_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxbgr_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_avx2 \ + jsimd_h2v1_extxrgb_merged_upsample_avx2 +%define jsimd_h2v2_merged_upsample_avx2 \ + jsimd_h2v2_extxrgb_merged_upsample_avx2 +%include "jdmrgext-avx2.asm" diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm new file mode 100644 index 0000000000..aedccc20f6 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm @@ -0,0 +1,135 @@ +; +; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_merged_upsample_sse2) + +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 \ + jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 \ + jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm new file mode 100644 index 0000000000..bb733c587a --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm @@ -0,0 +1,593 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 3 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2) + +EXTN(jsimd_h2v1_merged_upsample_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx + + mov ecx, r10d ; col + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr + + pop rcx ; col + +.columnloop: + + vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) + vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's) + vpcmpeqw ymm3, ymm3, ymm3 + vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV) + vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL + vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH + vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL + vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH + + vpaddw ymm5, ymm6, ymm3 + vpaddw ymm2, ymm4, ymm3 + vpaddw ymm1, ymm7, ymm3 + vpaddw ymm3, ymm0, ymm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH + vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL + vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH + vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL + + vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800)) + vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800)) + vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200)) + vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, [rel PW_ONE] + vpaddw ymm4, ymm4, [rel PW_ONE] + vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800)) + vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800)) + vpaddw ymm7, ymm7, [rel PW_ONE] + vpaddw ymm0, ymm0, [rel PW_ONE] + vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200)) + vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200)) + + vpaddw ymm6, ymm6, ymm5 + vpaddw ymm4, ymm4, ymm2 + vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H + vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L + vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H + vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L + + vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H + vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H + + vpunpckhwd ymm6, ymm5, ymm1 + vpunpcklwd ymm5, ymm5, ymm1 + vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285] + vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285] + vpunpckhwd ymm7, ymm2, ymm3 + vpunpcklwd ymm2, ymm2, ymm3 + vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285] + vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285] + + vpaddd ymm5, ymm5, [rel PD_ONEHALF] + vpaddd ymm6, ymm6, [rel PD_ONEHALF] + vpsrad ymm5, ymm5, SCALEBITS + vpsrad ymm6, ymm6, SCALEBITS + vpaddd ymm2, ymm2, [rel PD_ONEHALF] + vpaddd ymm7, ymm7, [rel PD_ONEHALF] + vpsrad ymm2, ymm2, SCALEBITS + vpsrad ymm7, ymm7, SCALEBITS + + vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + +.Yloop_2nd: + vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H + vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H + vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H + +.Yloop_1st: + vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) + + vpcmpeqw ymm6, ymm6, ymm6 + vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..} + vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE + vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO + + vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H) + vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H) + vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H) + + vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU) + vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV) + vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********) + vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********) + + vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU) + vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV) + vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********) + vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********) + + vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU) + vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV) + vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********) + vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F + ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V) + vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F + ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V) + + vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G + ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --) + vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F + ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07 + ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N) + + vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H + ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --) + + vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H + ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --) + vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G + ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --) + vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18 + ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O) + + vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H + ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --) + vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29 + ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P) + + vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03 + ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J) + vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14 + ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K) + vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29 + ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P) + vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07 + ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N) + + vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B + ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R) + vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C + ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S) + vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H + ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --) + vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V) + + vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A + ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q) + vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05 + ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F + ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L) + vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q + ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_YMMWORD + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmF + sub rcx, byte 2*SIZEOF_YMMWORD + jmp short .column_st31 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st31 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD + jmp short .column_st31 +.column_st31: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + vmovq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + vpsrldq xmmA, xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + vmovd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + vpsrldq xmmA, xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + vmovd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%else + vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********) + vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********) +%endif + ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **) + ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **) + ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **) + ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **) + ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **) + ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **) + ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **) + ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **) + + vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U) + vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U) + vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V) + vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V) + + vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E + ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U) + vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36 + ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M) + vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F + ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V) + vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37 + ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N) + + vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J) + vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R) + + vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B + ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J + ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N) + vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R + ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V) + + cmp rcx, byte SIZEOF_YMMWORD + jb short .column_st64 + + test rdi, SIZEOF_YMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC + vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH +.out0: + add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr + sub rcx, byte SIZEOF_YMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_YMMWORD ; inptr0 + dec al + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_YMMWORD ; inptr1 + add rdx, byte SIZEOF_YMMWORD ; inptr2 + jmp near .columnloop + +.column_st64: + cmp rcx, byte SIZEOF_YMMWORD/2 + jb short .column_st32 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmC + vmovdqa ymmD, ymmH + sub rcx, byte SIZEOF_YMMWORD/2 +.column_st32: + cmp rcx, byte SIZEOF_YMMWORD/4 + jb short .column_st16 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA + add rdi, byte SIZEOF_YMMWORD ; outptr + vmovdqa ymmA, ymmD + sub rcx, byte SIZEOF_YMMWORD/4 +.column_st16: + cmp rcx, byte SIZEOF_YMMWORD/8 + jb short .column_st15 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + vperm2i128 ymmA, ymmA, ymmA, 1 + sub rcx, byte SIZEOF_YMMWORD/8 +.column_st15: + ; Store two pixels (8 bytes) of ymmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_YMMWORD/16 + jb short .column_st7 + vmovq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_YMMWORD/16*4 + sub rcx, byte SIZEOF_YMMWORD/16 + vpsrldq xmmA, SIZEOF_YMMWORD/16*4 +.column_st7: + ; Store one pixel (4 bytes) of ymmA to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + vmovd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop rbx + vzeroupper + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2) + +EXTN(jsimd_h2v2_merged_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_avx2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_avx2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm new file mode 100644 index 0000000000..b176a4cd4f --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm @@ -0,0 +1,535 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx + + mov ecx, r10d ; col + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr + + pop rcx ; col + +.columnloop: + + movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [rel PW_ONE] + paddw xmm4, [rel PW_ONE] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [rel PW_ONE] + paddw xmm0, [rel PW_ONE] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [rel PW_MF0344_F0285] + pmaddwd xmm6, [rel PW_MF0344_F0285] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, xmm3 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm7, [rel PW_MF0344_F0285] + + paddd xmm5, [rel PD_ONEHALF] + paddd xmm6, [rel PD_ONEHALF] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [rel PD_ONEHALF] + paddd xmm7, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + +.Yloop_1st: + movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov word [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + mov byte [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10d = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12d = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm new file mode 100644 index 0000000000..fc274a95ea --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm @@ -0,0 +1,695 @@ +; +; jdsample.asm - upsampling (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_avx2) + +EXTN(jconst_fancy_upsample_avx2): + +PW_ONE times 16 dw 1 +PW_TWO times 16 dw 2 +PW_THREE times 16 dw 3 +PW_SEVEN times 16 dw 7 +PW_EIGHT times 16 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2) + +EXTN(jsimd_h2v1_fancy_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + push_xmm 3 + collect_args 4 + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data + + vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's) + vpcmpeqb xmm9, xmm9, xmm9 + vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff + + vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1) + vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff + +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_YMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + add rax, byte SIZEOF_YMMWORD-1 + and rax, byte -SIZEOF_YMMWORD + cmp rax, byte SIZEOF_YMMWORD + ja short .columnloop + +.columnloop_last: + vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD] + jmp short .upsample + +.columnloop: + vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD] + vperm2i128 ymm6, ymm0, ymm6, 0x20 + vpslldq ymm6, ymm6, 15 + +.upsample: + vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31) + + vperm2i128 ymm2, ymm0, ymm1, 0x20 + vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30) + vperm2i128 ymm4, ymm0, ymm1, 0x03 + vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --) + + vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30) + vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32) + + vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --) + + vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30) + vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22) + vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24) + vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32) + vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vpmullw ymm1, ymm1, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + vpaddw ymm2, ymm2, [rel PW_ONE] + vpaddw ymm5, ymm5, [rel PW_ONE] + vpaddw ymm3, ymm3, [rel PW_TWO] + vpaddw ymm6, ymm6, [rel PW_TWO] + + vpaddw ymm2, ymm2, ymm1 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm3, ymm3, ymm1 + vpaddw ymm6, ymm6, ymm4 + vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm3, ymm3, BYTE_BIT + vpsllw ymm6, ymm6, BYTE_BIT + vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5 + + sub rax, byte SIZEOF_YMMWORD + add rsi, byte 1*SIZEOF_YMMWORD ; inptr + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + cmp rax, byte SIZEOF_YMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + vzeroupper + uncollect_args 4 + pop_xmm 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2) + +EXTN(jsimd_h2v2_fancy_upsample_avx2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + push_xmm 3 + collect_args 4 + push rbx + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's) + vpcmpeqb xmm9, xmm9, xmm9 + vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff + vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2) + vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff + + test rax, SIZEOF_YMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0] + vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] + vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] + + vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6 + + vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + + vmovdqa YMMWORD [wk(0)], ymm1 + vmovdqa YMMWORD [wk(1)], ymm2 + + add rax, byte SIZEOF_YMMWORD-1 + and rax, byte -SIZEOF_YMMWORD + cmp rax, byte SIZEOF_YMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD] + vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD] + + vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1] + vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] + vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] + + vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) + vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) + vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vpmullw ymm0, ymm0, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + + vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save + vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data + vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2 + vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6 + + vperm2i128 ymm1, ymm8, ymm1, 0x20 + vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + vperm2i128 ymm2, ymm8, ymm2, 0x20 + vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0) + + vmovdqa YMMWORD [wk(2)], ymm1 + vmovdqa YMMWORD [wk(3)], ymm2 + +.upsample: + ; -- process the upper row + + vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vperm2i128 ymm0, ymm8, ymm7, 0x03 + vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm4, ymm8, ymm3, 0x20 + vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm5, ymm8, ymm7, 0x03 + vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm6, ymm8, ymm3, 0x20 + vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm2, ymm8, ymm3, 0x03 + vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm4, ymm8, ymm3, 0x03 + vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm8, ymm7, 0x20 + vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(0)], ymm4 + + vpmullw ymm7, ymm7, [rel PW_THREE] + vpmullw ymm3, ymm3, [rel PW_THREE] + vpaddw ymm1, ymm1, [rel PW_EIGHT] + vpaddw ymm5, ymm5, [rel PW_EIGHT] + vpaddw ymm0, ymm0, [rel PW_SEVEN] + vpaddw ymm2, [rel PW_SEVEN] + + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm5, ymm5, ymm3 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm0, ymm0, BYTE_BIT + vpsllw ymm2, ymm2, BYTE_BIT + vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31) + vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 + + ; -- process the lower row + + vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) + + vperm2i128 ymm7, ymm8, ymm6, 0x03 + vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --) + vperm2i128 ymm3, ymm8, ymm4, 0x20 + vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16) + + vperm2i128 ymm0, ymm8, ymm6, 0x03 + vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm2, ymm8, ymm4, 0x20 + vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16) + vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30) + + vperm2i128 ymm5, ymm8, ymm4, 0x03 + vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --) + vperm2i128 ymm3, ymm8, ymm4, 0x03 + vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --) + vperm2i128 ymm1, ymm8, ymm6, 0x20 + vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + + vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14) + vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32) + + vmovdqa YMMWORD [wk(1)], ymm3 + + vpmullw ymm6, ymm6, [rel PW_THREE] + vpmullw ymm4, ymm4, [rel PW_THREE] + vpaddw ymm1, ymm1, [rel PW_EIGHT] + vpaddw ymm0, ymm0, [rel PW_EIGHT] + vpaddw ymm7, ymm7, [rel PW_SEVEN] + vpaddw ymm5, ymm5, [rel PW_SEVEN] + + vpaddw ymm1, ymm1, ymm6 + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30) + vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62) + vpaddw ymm7, ymm7, ymm6 + vpaddw ymm5, ymm5, ymm4 + vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) + vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63) + + vpsllw ymm7, ymm7, BYTE_BIT + vpsllw ymm5, ymm5, BYTE_BIT + vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31) + vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63) + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0 + + sub rax, byte SIZEOF_YMMWORD + add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_YMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_YMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_YMMWORD ; outptr1 + cmp rax, byte SIZEOF_YMMWORD + ja near .columnloop + test rax, rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 4 + pop_xmm 3 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2) + +EXTN(jsimd_h2v1_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov edx, r11d + add rdx, byte (SIZEOF_YMMWORD-1) + and rdx, -SIZEOF_YMMWORD + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax, rdx ; colctr +.columnloop: + + cmp rax, byte SIZEOF_YMMWORD + ja near .above_16 + + vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + jmp short .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1 + + sub rax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr + add rdi, byte 2*SIZEOF_YMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + vzeroupper + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2) + +EXTN(jsimd_h2v2_upsample_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov edx, r11d + add rdx, byte (SIZEOF_YMMWORD-1) + and rdx, -SIZEOF_YMMWORD + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax, rdx ; colctr +.columnloop: + + cmp rax, byte SIZEOF_YMMWORD + ja short .above_16 + + vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + vpunpckhbw xmm1, xmm0, xmm0 + vpunpcklbw xmm0, xmm0, xmm0 + + vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + jmp near .nextrow + +.above_16: + vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] + + vpermq ymm0, ymm0, 0xd8 + vpunpckhbw ymm1, ymm0, ymm0 + vpunpcklbw ymm0, ymm0, ymm0 + + vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1 + vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 + vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1 + + sub rax, byte 2*SIZEOF_YMMWORD + jz short .nextrow + + add rsi, byte SIZEOF_YMMWORD ; inptr + add rbx, 2*SIZEOF_YMMWORD ; outptr0 + add rdi, 2*SIZEOF_YMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + vzeroupper + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm new file mode 100644 index 0000000000..20e07670e9 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm @@ -0,0 +1,664 @@ +; +; jdsample.asm - upsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample + +.columnloop: + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm2, [rel PW_ONE] + paddw xmm5, [rel PW_ONE] + paddw xmm3, [rel PW_TWO] + paddw xmm6, [rel PW_TWO] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + push rbx + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + pcmpeqb xmm1, xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 + + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [rel PW_THREE] + pmullw xmm3, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm5, [rel PW_EIGHT] + paddw xmm0, [rel PW_SEVEN] + paddw xmm2, [rel PW_SEVEN] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm0, [rel PW_EIGHT] + paddw xmm7, [rel PW_SEVEN] + paddw xmm5, [rel PW_SEVEN] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax, rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax, rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + uncollect_args 4 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width, +; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11d = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 32 + GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax, rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 + + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm new file mode 100644 index 0000000000..ef2796649b --- /dev/null +++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm @@ -0,0 +1,355 @@ +; +; jfdctflt.asm - floating-point FDCT (64-bit SSE) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse(FAST_FLOAT *data) +; + +; r10 = FAST_FLOAT *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, byte 4*SIZEOF_FAST_FLOAT + dec rcx + jnz near .columnloop + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm new file mode 100644 index 0000000000..2e1bfe6e8c --- /dev/null +++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm @@ -0,0 +1,389 @@ +; +; jfdctfst.asm - fast integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2(DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, xmm7 ; xmm6=tmp12 + paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [rel PW_F0707] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [rel PW_F0382] ; xmm2=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [rel PW_F0707] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [rel PW_F0382] ; xmm7=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm new file mode 100644 index 0000000000..e56258b48a --- /dev/null +++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm @@ -0,0 +1,320 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + vpunpcklwd %5, %1, %2 + vpunpckhwd %6, %1, %2 + vpunpcklwd %7, %3, %4 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 1) + ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53) + ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57) + ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73) + ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77) + + vpunpckldq %1, %5, %7 + vpunpckhdq %2, %5, %7 + vpunpckldq %3, %6, %8 + vpunpckhdq %4, %6, %8 + ; transpose coefficients(phase 2) + ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpermq %1, %1, 0x8D + vpermq %2, %2, 0x8D + vpermq %3, %3, 0xD8 + vpermq %4, %4, 0xD8 + ; transpose coefficients(phase 3) + ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70) + ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 9 + vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 + vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 + vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 + vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5 + + ; -- Even part + + vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1 + vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11 + vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12 + + vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 + vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11 + vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) +%if %9 == 1 + vpsllw %1, %7, PASS1_BITS ; %1=data0_4 +%else + vpaddw %7, %7, [rel PW_DESCALE_P2X] + vpsraw %1, %7, PASS1_BITS ; %1=data0_4 +%endif + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13 + vpunpcklwd %2, %6, %7 + vpunpckhwd %6, %6, %7 + vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L + vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H + + vpaddd %2, %2, [rel PD_DESCALE_P %+ %9] + vpaddd %6, %6, [rel PD_DESCALE_P %+ %9] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %6, %6, DESCALE_P %+ %9 + + vpackssdw %3, %2, %6 ; %6=data2_6 + + ; -- Odd part + + vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3 + vpunpcklwd %6, %7, %2 + vpunpckhwd %7, %7, %2 + vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L + vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6 + vpunpcklwd %2, %8, %4 + vpunpckhwd %4, %8, %4 + vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L + vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H + + vpaddd %2, %2, %6 ; %2=data7_5L + vpaddd %4, %4, %7 ; %4=data7_5H + + vpaddd %2, %2, [rel PD_DESCALE_P %+ %9] + vpaddd %4, %4, [rel PD_DESCALE_P %+ %9] + vpsrad %2, %2, DESCALE_P %+ %9 + vpsrad %4, %4, DESCALE_P %+ %9 + + vpackssdw %4, %2, %4 ; %4=data7_5 + + vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4 + vpunpcklwd %8, %5, %2 + vpunpckhwd %5, %5, %2 + vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L + vpmaddwd %5, %5, [rel PW_F050_MF256_F060_MF089] ; %5=tmp6_7H + + vpaddd %8, %8, %6 ; %8=data3_1L + vpaddd %5, %5, %7 ; %5=data3_1H + + vpaddd %8, %8, [rel PD_DESCALE_P %+ %9] + vpaddd %5, %5, [rel PD_DESCALE_P %+ %9] + vpsrad %8, %8, DESCALE_P %+ %9 + vpsrad %5, %5, DESCALE_P %+ %9 + + vpackssdw %2, %8, %5 ; %2=data3_1 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_avx2) + +EXTN(jconst_fdct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562 + times 4 dw (F_1_501 - F_0_899), -F_0_899 +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_avx2(DCTELEM *data) +; + +; r10 = DCTELEM *data + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_avx2) + +EXTN(jsimd_fdct_islow_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 1 + + ; ---- Pass 1: process rows. + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)] + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)] + ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vperm2i128 ymm0, ymm4, ymm6, 0x20 + vperm2i128 ymm1, ymm4, ymm6, 0x31 + vperm2i128 ymm2, ymm5, ymm7, 0x20 + vperm2i128 ymm3, ymm5, ymm7, 0x31 + ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 + + ; ---- Pass 2: process columns. + + vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 + vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 + + vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 + vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3 + vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5 + vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3 + vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5 + vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6 + vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7 + + vzeroupper + uncollect_args 1 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm new file mode 100644 index 0000000000..ec1f383ccb --- /dev/null +++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm @@ -0,0 +1,619 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2(DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 32 + GLOBAL_FUNCTION(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 1 + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, xmm7 ; xmm6=tmp11 + psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm6, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm0, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, xmm0 ; xmm1=data1H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm4, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + + movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L + pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [rel PD_DESCALE_P1] + paddd xmm3, [rel PD_DESCALE_P1] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [rel PW_DESCALE_P2X] + paddw xmm5, [rel PW_DESCALE_P2X] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd xmm2, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm6, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, xmm1 + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd xmm1, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L + pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [rel PD_DESCALE_P2] + paddd xmm7, [rel PD_DESCALE_P2] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args 1 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm new file mode 100644 index 0000000000..ab95e1a6d6 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm @@ -0,0 +1,481 @@ +; +; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 32 + GLOBAL_FUNCTION(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args 4 + push rbx + + ; ---- Pass 1: process columns from input, store into work array. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT *wsptr + mov rcx, DCTSIZE/4 ; ctr +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT *wsptr + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + mov rcx, DCTSIZE/4 ; ctr +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm new file mode 100644 index 0000000000..a66a6811e9 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm @@ -0,0 +1,490 @@ +; +; jidctfst.asm - fast integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 32 + GLOBAL_DATA(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = jpeg_component_info *compptr +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [rel PW_F1414] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [rel PW_F1847] ; xmm5=z5 + pmulhw xmm0, [rel PW_MF1613] + pmulhw xmm2, [rel PW_F1082] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F1414] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [rel PW_F1847] ; xmm4=z5 + pmulhw xmm6, [rel PW_MF1613] + pmulhw xmm0, [rel PW_F1082] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm new file mode 100644 index 0000000000..9c38f9e390 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm @@ -0,0 +1,417 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions +; %1-%4: Input/output registers +; %5-%8: Temp registers + +%macro dotranspose 8 + ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) + ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) + ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) + ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76) + + vpermq %5, %1, 0xD8 + vpermq %6, %2, 0x72 + vpermq %7, %3, 0xD8 + vpermq %8, %4, 0x72 + ; transpose coefficients(phase 1) + ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71) + ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73) + ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75) + ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77) + + vpunpcklwd %1, %5, %6 + vpunpckhwd %2, %5, %6 + vpunpcklwd %3, %7, %8 + vpunpckhwd %4, %7, %8 + ; transpose coefficients(phase 2) + ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72) + ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73) + ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76) + ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77) + + vpunpcklwd %5, %1, %2 + vpunpcklwd %6, %3, %4 + vpunpckhwd %7, %1, %2 + vpunpckhwd %8, %3, %4 + ; transpose coefficients(phase 3) + ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53) + ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57) + ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73) + ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77) + + vpunpcklqdq %1, %5, %6 + vpunpckhqdq %2, %5, %6 + vpunpcklqdq %3, %7, %8 + vpunpckhqdq %4, %7, %8 + ; transpose coefficients(phase 4) + ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) + ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) + ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) + ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) +%endmacro + +; -------------------------------------------------------------------------- +; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions +; %1-%4: Input/output registers +; %5-%12: Temp registers +; %9: Pass (1 or 2) + +%macro dodct 13 + ; -- Even part + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2 + vpunpcklwd %5, %3, %6 ; %5=in26_62L + vpunpckhwd %6, %3, %6 ; %6=in26_62H + vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L + vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H + + vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0 + vpsignw %1, %1, [rel PW_1_NEG1] + vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4) + + vpxor %1, %1, %1 + vpunpcklwd %8, %1, %7 ; %8=tmp0_1L + vpunpckhwd %1, %1, %7 ; %1=tmp0_1H + vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS + vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS + + vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L + vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L + vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H + vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H + + ; -- Odd part + + vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3 + vpunpcklwd %7, %1, %8 ; %7=z34_43L + vpunpckhwd %8, %1, %8 ; %8=z34_43H + vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L + vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3 + vpunpcklwd %3, %4, %2 ; %3=in71_53L + vpunpckhwd %4, %4, %2 ; %4=in71_53H + + vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L + vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H + vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L + vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H + + vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L + vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H + vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L + vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H + vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L + vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H + + ; -- Final output stage + + vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L + vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H + vpaddd %1, %1, [rel PD_DESCALE_P %+ %13] + vpaddd %2, %2, [rel PD_DESCALE_P %+ %13] + vpsrad %1, %1, DESCALE_P %+ %13 + vpsrad %2, %2, DESCALE_P %+ %13 + vpackssdw %1, %1, %2 ; %1=data0_1 + + vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L + vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H + vpaddd %3, %3, [rel PD_DESCALE_P %+ %13] + vpaddd %4, %4, [rel PD_DESCALE_P %+ %13] + vpsrad %3, %3, DESCALE_P %+ %13 + vpsrad %4, %4, DESCALE_P %+ %13 + vpackssdw %4, %3, %4 ; %4=data7_6 + + vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L + vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H + vpaddd %7, %7, [rel PD_DESCALE_P %+ %13] + vpaddd %8, %8, [rel PD_DESCALE_P %+ %13] + vpsrad %7, %7, DESCALE_P %+ %13 + vpsrad %8, %8, DESCALE_P %+ %13 + vpackssdw %2, %7, %8 ; %2=data3_2 + + vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L + vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H + vpaddd %7, %7, [rel PD_DESCALE_P %+ %13] + vpaddd %8, %8, [rel PD_DESCALE_P %+ %13] + vpsrad %7, %7, DESCALE_P %+ %13 + vpsrad %8, %8, DESCALE_P %+ %13 + vpackssdw %3, %7, %8 ; %3=data4_5 +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_avx2) + +EXTN(jconst_idct_islow_avx2): + +PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 + times 4 dw (F_0_541 - F_1_847), F_0_541 +PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 + times 4 dw (F_1_175 - F_0_390), F_1_175 +PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899 + times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899) + times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 32 db CENTERJSAMPLE +PW_1_NEG1 times 8 dw 1 + times 8 dw -1 + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = jpeg_component_info *compptr +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_avx2) + +EXTN(jsimd_idct_islow_avx2): + push rbp + mov rax, rsp ; rax = original rbp + mov rbp, rsp ; rbp = aligned rbp + push_xmm 4 + collect_args 4 + + ; ---- Pass 1: process columns. + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 + mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)] + vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)] + vpor xmm1, xmm1, xmm0 + vpacksswb xmm1, xmm1, xmm1 + vpacksswb xmm1, xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)] + vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + + vpsllw xmm5, xmm5, PASS1_BITS + + vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03) + vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07) + vinserti128 ymm4, ymm4, xmm5, 1 + + vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04) + vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05) + vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06) + vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) + + jmp near .column_end +%endif +.columnDCT: + + vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1 + vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3 + vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5 + vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7 + vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)] + + vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4 + vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1 + vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 + vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 + + dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 + +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows. + + vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm3=in7_5 + vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 + + dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2 + ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 + + dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 + + vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 + vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67 + vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP] + vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP] + + vextracti128 xmm6, ymm1, 1 ; xmm3=data67 + vextracti128 xmm4, ymm0, 1 ; xmm2=data45 + vextracti128 xmm2, ymm1, 0 ; xmm1=data23 + vextracti128 xmm0, ymm0, 0 ; xmm0=data01 + + vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + vzeroupper + + mov eax, r13d + + mov rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + + mov rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + mov rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + uncollect_args 4 + pop_xmm 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm new file mode 100644 index 0000000000..8983bf099c --- /dev/null +++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm @@ -0,0 +1,846 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slower but more accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS - PASS1_BITS) +%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) +PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = jpeg_component_info *compptr +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, xmm0 + pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1 ; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; xmm7=data6H + + movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 + psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm new file mode 100644 index 0000000000..7fbfcc519d --- /dev/null +++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm @@ -0,0 +1,573 @@ +; +; jidctred.asm - reduced-size IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1) +%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1) +%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2) +%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) +F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 32 + GLOBAL_DATA(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847, -F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061, -F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509 +PW_F145_MF021 times 4 dw F_1_451, -F_0_211 +PW_F362_MF127 times 4 dw F_3_624, -F_1_272 +PW_F085_MF072 times 4 dw F_0_850, -F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 32 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + +%define original_rbp rbp + 0 +%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD + ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 32 + GLOBAL_FUNCTION(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 4 + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0) + + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2 + + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad xmm3, DESCALE_P2_4 + + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [rel PB_CENTERJSAMP] + + pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + uncollect_args 4 + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13d = JDIMENSION output_col + + align 32 + GLOBAL_FUNCTION(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 4 + push rbx + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [rel PW_F362_MF127] + pmaddwd xmm5, [rel PW_F085_MF072] + + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [rel PW_F362_MF127] + pmaddwd xmm2, [rel PW_F085_MF072] + + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 + + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [rel PW_F362_MF127] + pmaddwd xmm7, [rel PW_F085_MF072] + + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6, [rel PD_DESCALE_P2_2] + psrad xmm6, DESCALE_P2_2 + + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6, [rel PB_CENTERJSAMP] + + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov word [rdx+rax*SIZEOF_JSAMPLE], bx + mov word [rsi+rax*SIZEOF_JSAMPLE], cx + + pop rbx + uncollect_args 4 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm new file mode 100644 index 0000000000..83596a915b --- /dev/null +++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm @@ -0,0 +1,154 @@ +; +; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + push rbx + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/2 +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop + + pop rbx + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +; r10 = JCOEFPTR coef_block +; r11 = FAST_FLOAT *divisors +; r12 = FAST_FLOAT *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/16 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm new file mode 100644 index 0000000000..5f04d22330 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm @@ -0,0 +1,162 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit AVX2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, 2018, D. R. Commander. +; Copyright (C) 2016, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_avx2) + +EXTN(jsimd_convsamp_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov eax, r11d + + mov rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vpcmpeqw ymm7, ymm7, ymm7 + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpaddw ymm3, ymm3, ymm7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0 + vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1 + vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2 + vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_avx2) + +EXTN(jsimd_quantize_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)] + vpabsw ymm0, ymm4 + vpabsw ymm1, ymm5 + vpabsw ymm2, ymm6 + vpabsw ymm3, ymm7 + + vpaddw ymm0, YMMWORD [CORRECTION(0,0,r11)] ; correction + roundfactor + vpaddw ymm1, YMMWORD [CORRECTION(2,0,r11)] + vpaddw ymm2, YMMWORD [CORRECTION(4,0,r11)] + vpaddw ymm3, YMMWORD [CORRECTION(6,0,r11)] + vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)] ; reciprocal + vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)] + vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)] + vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)] + vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)] ; scale + vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)] + vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)] + vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)] + + vpsignw ymm0, ymm0, ymm4 + vpsignw ymm1, ymm1, ymm5 + vpsignw ymm2, ymm2, ymm6 + vpsignw ymm3, ymm3, ymm7 + + vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2 + vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm new file mode 100644 index 0000000000..bb6fa69ea3 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm @@ -0,0 +1,187 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + push rbx + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/4 +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + pop rbx + uncollect_args 3 + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m, n, b) \ + XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) +%define CORRECTION(m, n, b) \ + XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) +%define SCALE(m, n, b) \ + XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 32 + GLOBAL_FUNCTION(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args 3 + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c new file mode 100644 index 0000000000..1e5698b3a4 --- /dev/null +++ b/media/libjpeg/simd/x86_64/jsimd.c @@ -0,0 +1,1076 @@ +/* + * jsimd_x86_64.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" +#include "jconfigint.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ +#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ + +static unsigned int simd_support = (unsigned int)(~0); +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd(void) +{ +#ifndef NO_GETENV + char *env = NULL; +#endif + + if (simd_support != ~0U) + return; + + simd_support = jpeg_simd_cpu_support(); + +#ifndef NO_GETENV + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCESSE2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE2; + env = getenv("JSIMD_FORCEAVX2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_AVX2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565(void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_ycc_convert_avx2; + sse2fct = jsimd_extrgb_ycc_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_ycc_convert_avx2; + sse2fct = jsimd_extrgbx_ycc_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_ycc_convert_avx2; + sse2fct = jsimd_extbgr_ycc_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_ycc_convert_avx2; + sse2fct = jsimd_extbgrx_ycc_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_ycc_convert_avx2; + sse2fct = jsimd_extxbgr_ycc_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_ycc_convert_avx2; + sse2fct = jsimd_extxrgb_ycc_convert_sse2; + break; + default: + avx2fct = jsimd_rgb_ycc_convert_avx2; + sse2fct = jsimd_rgb_ycc_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_extrgb_gray_convert_avx2; + sse2fct = jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_extrgbx_gray_convert_avx2; + sse2fct = jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_extbgr_gray_convert_avx2; + sse2fct = jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_extbgrx_gray_convert_avx2; + sse2fct = jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_extxbgr_gray_convert_avx2; + sse2fct = jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_extxrgb_gray_convert_avx2; + sse2fct = jsimd_extxrgb_gray_convert_sse2; + break; + default: + avx2fct = jsimd_rgb_gray_convert_avx2; + sse2fct = jsimd_rgb_gray_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_ycc_extrgb_convert_avx2; + sse2fct = jsimd_ycc_extrgb_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_ycc_extrgbx_convert_avx2; + sse2fct = jsimd_ycc_extrgbx_convert_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_ycc_extbgr_convert_avx2; + sse2fct = jsimd_ycc_extbgr_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_ycc_extbgrx_convert_avx2; + sse2fct = jsimd_ycc_extbgrx_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_ycc_extxbgr_convert_avx2; + sse2fct = jsimd_ycc_extxbgr_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_ycc_extxrgb_convert_avx2; + sse2fct = jsimd_ycc_extxrgb_convert_sse2; + break; + default: + avx2fct = jsimd_ycc_rgb_convert_avx2; + sse2fct = jsimd_ycc_rgb_convert_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_fancy_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_AVX2) + jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_merged_upsample_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2; + break; + default: + avx2fct = jsimd_h2v2_merged_upsample_avx2; + sse2fct = jsimd_h2v2_merged_upsample_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) +{ + void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2; + sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2; + break; + default: + avx2fct = jsimd_h2v1_merged_upsample_avx2; + sse2fct = jsimd_h2v1_merged_upsample_sse2; + break; + } + + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_convsamp_avx2(sample_data, start_col, workspace); + else + jsimd_convsamp_sse2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow(DCTELEM *data) +{ + if (simd_support & JSIMD_AVX2) + jsimd_fdct_islow_avx2(data); + else + jsimd_fdct_islow_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast(DCTELEM *data) +{ + jsimd_fdct_ifast_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_float(FAST_FLOAT *data) +{ + jsimd_fdct_float_sse(data); +} + +GLOBAL(int) +jsimd_can_quantize(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_AVX2) + return 1; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + if (simd_support & JSIMD_AVX2) + jsimd_quantize_avx2(coef_block, divisors, workspace); + else + jsimd_quantize_sse2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + jsimd_quantize_float_sse2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2)) + return 1; + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_AVX2) + jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, + output_col); + else + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET *) +jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_first_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 8) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *values, size_t *zerobits) +{ + jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, + Sl, Al, values, zerobits); +} + +GLOBAL(int) +jsimd_can_encode_mcu_AC_refine_prepare(void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (SIZEOF_SIZE_T != 8) + return 0; + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, + const int *jpeg_natural_order_start, int Sl, + int Al, JCOEF *absvalues, size_t *bits) +{ + return jsimd_encode_mcu_AC_refine_prepare_sse2(block, + jpeg_natural_order_start, + Sl, Al, absvalues, bits); +} diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm new file mode 100644 index 0000000000..705f813d7d --- /dev/null +++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm @@ -0,0 +1,86 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2016, D. R. Commander. +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support(void) +; + + align 32 + GLOBAL_FUNCTION(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push rbx + push rdi + + xor rdi, rdi ; simd support flag + + ; Assume that all x86-64 processors support SSE & SSE2 instructions + or rdi, JSIMD_SSE2 + or rdi, JSIMD_SSE + + ; Check whether CPUID leaf 07H is supported + ; (leaf 07H is used to check for AVX2 instruction support) + mov rax, 0 + cpuid + cmp rax, 7 + jl short .return ; Maximum leaf < 07H + + ; Check for AVX2 instruction support + mov rax, 7 + xor rcx, rcx + cpuid + mov rax, rbx ; rax = Extended feature flags + + test rax, 1<<5 ; bit5:AVX2 + jz short .return + + ; Check for AVX2 O/S support + mov rax, 1 + xor rcx, rcx + cpuid + test rcx, 1<<27 + jz short .return ; O/S does not support XSAVE + test rcx, 1<<28 + jz short .return ; CPU does not support AVX2 + + xor rcx, rcx + xgetbv + and rax, 6 + cmp rax, 6 ; O/S does not manage XMM/YMM state + ; using XSAVE + jnz short .return + + or rdi, JSIMD_AVX2 + +.return: + mov rax, rdi + + pop rdi + pop rbx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 32 |