;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; All of the LOADn macros rely on names that the including function must
; %define before use: X0/X1 (gathered words), N0/N1 (cleared here), LUT
; (pointer to 32-bit indices), BLOCK (base of the 16-bit coefficients),
; T0/T0d and T1/T1d (scratch, clobbered) and, for the partial loads, LENEND.
; The unrolled sequences are generated with %rep so that every lane uses the
; exact same instruction pattern.

; LOAD16: gather 16 words.
;   X0 lane i = word [BLOCK + 2 * LUT[i]],      i = 0..7
;   X1 lane i = word [BLOCK + 2 * LUT[i + 8]],  i = 0..7
; N0 and N1 are zeroed.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

  %assign %%lane 0
  %rep 8
    mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
    mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%endmacro

; LOAD15: gather 8 words into X0 and (LENEND, at most 7) words into X1.
; X1 is cleared first, so the lanes that are not loaded read as zero.
; Lane 0 of X1 is always loaded; lane i (1..6) is loaded only when
; LENEND >= i + 1.  N0 and N1 are zeroed.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    ; Lane 0 of both halves is loaded unconditionally.
    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining seven lanes of the (complete) low half.
  %assign %%lane 1
  %rep 7
    mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep

    ; Up to six more lanes of the (partial) high half.
  %assign %%lane 1
  %rep 6
    cmp         LENEND, (%%lane + 1)
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%%.ELOAD15:
%endmacro

; LOAD8: gather 8 words into X0 (lane i = word [BLOCK + 2 * LUT[i]]).
; N0 is zeroed.  Only T0 is used as scratch.
%macro LOAD8 0
    pxor        N0, N0

  %assign %%lane 0
  %rep 8
    mov         T0d, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%endmacro

; LOAD7: gather (LENEND, at most 7) words into X0.  X0 is cleared first, so
; the lanes that are not loaded read as zero.  Lane 0 is always loaded;
; lane i (1..6) is loaded only when LENEND >= i + 1.  N0 is zeroed.
; Only T1 is used as scratch.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

  %assign %%lane 1
  %rep 6
    cmp         LENEND, (%%lane + 1)
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%%.ELOAD7:
%endmacro

; REDUCE0: build a 64-bit "non-zero" bitmap from the 64 words at VALUES and
; store it at [r15].  Bit k of the result is set when word k is non-zero.
;
; Requirements: VALUES must be 16-byte aligned (movdqa loads) and ZERO must
; hold an all-zero vector.
; Clobbers: xmm0-xmm7, rax, rcx, rdx, rsi, flags.
%macro REDUCE0 0
    ; Load all 64 words (8 words per register).
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    ; Each word becomes 0xFFFF if it was zero, 0x0000 otherwise.
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    ; Narrow the word masks to byte masks: 16 words per register.
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    ; One bit per word: 16 bits from each packed register.
    pmovmskb    eax, xmm0               ; words  0..15
    pmovmskb    ecx, xmm2               ; words 16..31
    pmovmskb    edx, xmm4               ; words 32..47
    pmovmskb    esi, xmm6               ; words 48..63

    ; Merge the four 16-bit groups into a single 64-bit "is zero" bitmap.
    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48
    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax                     ; "is zero" -> "is non-zero"
    mov         MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; For each k in [0, Sl), with c = block[jpeg_natural_order_start[k]]:
;   values[k]            = abs(c) >> Al          (logical shift)
;   values[k + DCTSIZE2] = values[k] if c >= 0, else ~values[k]
; values[] is then zero-filled, in groups of 8 words, up to DCTSIZE2 entries,
; and *zerobits receives a bitmap whose bit k is set when values[k] != 0.
;
; values must be 16-byte aligned (it is accessed with movdqa).
; When Sl is 0, nothing is read through block / jpeg_natural_order_start:
; values[0 .. DCTSIZE2-1] is zeroed and *zerobits is 0.
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    ; NOTE(review): collect_args (jsimdext.inc, not visible here) presumably
    ; depends on the value left in rax by this prologue -- confirm before
    ; changing the sequence.
    mov         rax, rsp                     ; rax = rsp just after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; reserve one aligned XMM slot
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's xmm9

    movd        AL, r13d                ; AL = shift count (Al)
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN             ; r13d is free now that Al is in AL
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full groups of 16
    jz          .ELOOP16

.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (0 > x) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; X = abs(x) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N = X, inverted for negative x
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING                ; Sl is a multiple of 16: no tail

.ELOOP16:
    ; Tail of 1..15 coefficients (or none at all when Sl == 0).
    test        LEN, 8
    jz          .TRY7                   ; fewer than 8 left
    test        LEN, 7
    jz          .TRY8                   ; exactly 8 left

    ; 9..15 coefficients left.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING

.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING

.TRY7:
    ; LENEND can only be 0 here when Sl == 0.  LOAD7 always reads lane 0, and
    ; storing a block here would make the padding loop below write a ninth
    ; group of 8 words, so skip straight to the zero fill in that case.
    test        LENEND, LENEND
    jz          .PADDING
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2

.PADDING:
    ; K = (groups of 8 already written) - (DCTSIZE2 / 8): a negative count
    ; of groups still to be zeroed, or 0 when values[] is already full.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP

.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind to values[0]

    REDUCE0                             ; *zerobits = non-zero bitmap

    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore the caller's xmm9
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved by the prologue
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; For each k in [0, Sl), with c = block[jpeg_natural_order_start[k]]:
;   absvalues[k] = abs(c) >> Al          (logical shift)
; absvalues[] is then zero-filled, in groups of 8 words, up to DCTSIZE2
; entries.  Two 64-bit words are stored through bits:
;   bits[0]: bit k set when absvalues[k] != 0
;   bits[1]: bit k clear when coefficient k is negative, set otherwise
;            (including every padding position)
; The return value (eax) is the largest k for which absvalues[k] == 1, or 0
; when there is no such k.
;
; absvalues must be 16-byte aligned (it is accessed with movdqa).
; When Sl is 0, nothing is read through block / jpeg_natural_order_start:
; absvalues[0 .. DCTSIZE2-1] is zeroed, bits[0] is 0, bits[1] is all ones
; and the return value is 0.
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *absvalues
; r15 = size_t *bits

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    ; NOTE(review): collect_args (jsimdext.inc, not visible here) presumably
    ; depends on the value left in rax by this prologue -- confirm before
    ; changing the sequence.
    mov         rax, rsp                     ; rax = rsp just after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; reserve one aligned XMM slot
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's xmm9

    xor         SIGN, SIGN              ; accumulated "negative" bitmap
    xor         EOB, EOB                ; return value, 0 until a 1 is seen
    xor         KK, KK                  ; index of the current group's lane 0
    movd        AL, r13d                ; AL = shift count (Al)
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; every word of ONE = 1
    mov         K, LEN
    mov         LENEND, LEN             ; r13d is free now that Al is in AL
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full groups of 16
    jz          .ELOOPR16

.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (0 > x) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; X = abs(x) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; X = (X == 1) ? 0xFFFF : 0
    pcmpeqw     X1, ONE
    packsswb    N0, N1                  ; word masks -> byte masks
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; 16 "negative" bits
    pmovmskb    T1d, X0                 ; 16 "equals one" bits
    shr         SIGN, 16                ; make room for 16 new sign bits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; T1d = highest set bit; ZF if none
    jz          .CONTINUER16            ; no lane equals 1: keep EOB
    mov         EOB, KK
    add         EOB, T1d                ; EOB = KK + lane of the last 1
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR               ; Sl is a multiple of 16: no tail

.ELOOPR16:
    ; Tail of 1..15 coefficients (or none at all when Sl == 0).
    test        LEN, 8
    jz          .TRYR7                  ; fewer than 8 left
    test        LEN, 7
    jz          .TRYR8                  ; exactly 8 left

    ; 9..15 coefficients left.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; 16 "negative" bits
    pmovmskb    T1d, X0                 ; 16 "equals one" bits
    shr         SIGN, 16                ; make room for 16 new sign bits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; T1d = highest set bit; ZF if none
    jz          .CONTINUER15            ; no lane equals 1: keep EOB
    mov         EOB, KK
    add         EOB, T1d                ; EOB = KK + lane of the last 1
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR

.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only the low 8 bytes carry data
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; 8 "negative" bits
    pmovmskb    T1d, X0                 ; 8 "equals one" bits
    shr         SIGN, 8                 ; make room for 8 new sign bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; T1d = highest set bit; ZF if none
    jz          .CONTINUER8             ; no lane equals 1: keep EOB
    mov         EOB, KK
    add         EOB, T1d                ; EOB = KK + lane of the last 1
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR

.TRYR7:
    ; LENEND can only be 0 here when Sl == 0.  LOAD7 always reads lane 0, and
    ; storing a block here would make the padding loop below write a ninth
    ; group of 8 words, so skip straight to the zero fill in that case.
    test        LENEND, LENEND
    jz          .PADDINGR
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only the low 8 bytes carry data
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; 8 "negative" bits
    pmovmskb    T1d, X0                 ; 8 "equals one" bits
    shr         SIGN, 8                 ; make room for 8 new sign bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; T1d = highest set bit; ZF if none
    jz          .CONTINUER7             ; no lane equals 1: keep EOB
    mov         EOB, KK
    add         EOB, T1d                ; EOB = KK + lane of the last 1
.CONTINUER7:
    add         VALUES, 8*2

.PADDINGR:
    ; K = (groups of 8 already written) - (DCTSIZE2 / 8): a negative count
    ; of groups still to be zeroed, or 0 when absvalues[] is already full.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8                 ; padding contributes 8 clear sign bits
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR

.EPADDINGR:
    not         SIGN                    ; "negative" bitmap -> its complement
    sub         VALUES, DCTSIZE2*2      ; rewind to absvalues[0]
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0                             ; bits[0] = non-zero bitmap

    mov         eax, EOB                ; return value
    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore the caller's xmm9
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved by the prologue
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32