diff options
Diffstat (limited to 'media/libjpeg/simd/nasm')
-rw-r--r-- | media/libjpeg/simd/nasm/jcolsamp.inc | 135 | ||||
-rw-r--r-- | media/libjpeg/simd/nasm/jdct.inc | 31 | ||||
-rw-r--r-- | media/libjpeg/simd/nasm/jsimdcfg.inc | 93 | ||||
-rw-r--r-- | media/libjpeg/simd/nasm/jsimdcfg.inc.h | 133 | ||||
-rw-r--r-- | media/libjpeg/simd/nasm/jsimdext.inc | 520 |
5 files changed, 912 insertions, 0 deletions
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc new file mode 100644 index 0000000000..6f6d7f29d1 --- /dev/null +++ b/media/libjpeg/simd/nasm/jcolsamp.inc @@ -0,0 +1,135 @@ +; +; jcolsamp.inc - private declarations for color conversion & up/downsampling +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2015, Intel Corporation. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; -------------------------------------------------------------------------- + +; pseudo-resisters to make ordering of RGB configurable +; +%if RGB_RED == 0 +%define mmA mm0 +%define mmB mm1 +%define xmmA xmm0 +%define xmmB xmm1 +%define ymmA ymm0 +%define ymmB ymm1 +%elif RGB_GREEN == 0 +%define mmA mm2 +%define mmB mm3 +%define xmmA xmm2 +%define xmmB xmm3 +%define ymmA ymm2 +%define ymmB ymm3 +%elif RGB_BLUE == 0 +%define mmA mm4 +%define mmB mm5 +%define xmmA xmm4 +%define xmmB xmm5 +%define ymmA ymm4 +%define ymmB ymm5 +%else +%define mmA mm6 +%define mmB mm7 +%define xmmA xmm6 +%define xmmB xmm7 +%define ymmA ymm6 +%define ymmB ymm7 +%endif + +%if RGB_RED == 1 +%define mmC mm0 +%define mmD mm1 +%define xmmC xmm0 +%define xmmD xmm1 +%define ymmC ymm0 +%define ymmD ymm1 +%elif RGB_GREEN == 1 +%define mmC mm2 +%define mmD mm3 +%define xmmC xmm2 +%define xmmD xmm3 +%define ymmC ymm2 +%define ymmD ymm3 +%elif RGB_BLUE == 1 +%define mmC mm4 +%define mmD mm5 +%define xmmC xmm4 +%define xmmD xmm5 +%define ymmC ymm4 +%define ymmD ymm5 +%else +%define mmC mm6 +%define mmD mm7 +%define xmmC xmm6 +%define xmmD xmm7 +%define ymmC ymm6 +%define ymmD ymm7 +%endif + +%if RGB_RED == 2 +%define mmE mm0 +%define mmF mm1 +%define xmmE xmm0 +%define xmmF xmm1 +%define ymmE ymm0 +%define ymmF ymm1 +%elif RGB_GREEN == 2 +%define mmE mm2 +%define mmF mm3 +%define xmmE xmm2 +%define xmmF xmm3 +%define ymmE ymm2 +%define ymmF ymm3 +%elif RGB_BLUE == 2 +%define mmE mm4 +%define mmF mm5 +%define xmmE xmm4 +%define xmmF xmm5 +%define ymmE ymm4 +%define ymmF ymm5 +%else +%define mmE mm6 +%define mmF mm7 +%define xmmE xmm6 +%define xmmF xmm7 +%define ymmE ymm6 +%define ymmF ymm7 +%endif + +%if RGB_RED == 3 +%define mmG mm0 +%define mmH mm1 +%define xmmG xmm0 +%define xmmH xmm1 +%define ymmG ymm0 +%define ymmH ymm1 +%elif RGB_GREEN == 3 +%define mmG mm2 +%define mmH mm3 +%define xmmG xmm2 +%define xmmH xmm3 +%define ymmG ymm2 +%define ymmH ymm3 +%elif RGB_BLUE == 3 +%define mmG mm4 +%define mmH mm5 +%define xmmG xmm4 +%define xmmH xmm5 +%define ymmG ymm4 +%define ymmH ymm5 +%else +%define mmG mm6 +%define mmH mm7 +%define xmmG xmm6 +%define xmmH xmm7 +%define ymmG ymm6 +%define ymmH ymm7 +%endif + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/nasm/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc new file mode 100644 index 0000000000..9192f66f0c --- /dev/null +++ b/media/libjpeg/simd/nasm/jdct.inc @@ -0,0 +1,31 @@ +; +; jdct.inc - private declarations for forward & reverse DCT subsystems +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2018, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc + +; Each IDCT routine is responsible for range-limiting its results and +; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could +; be quite far out of range if the input data is corrupt, so a bulletproof +; range-limiting step is required. We use a mask-and-table-lookup method +; to do the combined operations quickly. +; +%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples + +%define ROW(n, b, s) ((b) + (n) * (s)) +%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE) + +%define DWBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD) +%define MMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD) +%define XMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD) +%define YMMBLOCK(m, n, b, s) \ + ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD) + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc new file mode 100644 index 0000000000..667024a5f9 --- /dev/null +++ b/media/libjpeg/simd/nasm/jsimdcfg.inc @@ -0,0 +1,93 @@ +; +; Automatically generated include file from jsimdcfg.inc.h +; +; +; -- jpeglib.h +; +%define DCTSIZE 8 +%define DCTSIZE2 64 +; +; -- jmorecfg.h +; +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define EXT_RGB_RED 0 +%define EXT_RGB_GREEN 1 +%define EXT_RGB_BLUE 2 +%define EXT_RGB_PIXELSIZE 3 +%define EXT_RGBX_RED 0 +%define EXT_RGBX_GREEN 1 +%define EXT_RGBX_BLUE 2 +%define EXT_RGBX_PIXELSIZE 4 +%define EXT_BGR_RED 2 +%define EXT_BGR_GREEN 1 +%define EXT_BGR_BLUE 0 +%define EXT_BGR_PIXELSIZE 3 +%define EXT_BGRX_RED 2 +%define EXT_BGRX_GREEN 1 +%define EXT_BGRX_BLUE 0 +%define EXT_BGRX_PIXELSIZE 4 +%define EXT_XBGR_RED 3 +%define EXT_XBGR_GREEN 2 +%define EXT_XBGR_BLUE 1 +%define EXT_XBGR_PIXELSIZE 4 +%define EXT_XRGB_RED 1 +%define EXT_XRGB_GREEN 2 +%define EXT_XRGB_BLUE 3 +%define EXT_XRGB_PIXELSIZE 4 +%define RGBX_FILLER_0XFF 1 +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. +; +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) +%define CENTERJSAMPLE 128 +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) +; +; -- jdct.h +; +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) +%define float FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float) +; To maximize parallelism, Type short is changed to short. +; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) +; +; -- jsimd.h +; +%define JSIMD_NONE 0x00 +%define JSIMD_MMX 0x01 +%define JSIMD_3DNOW 0x02 +%define JSIMD_SSE 0x04 +%define JSIMD_SSE2 0x08 +%define JSIMD_AVX2 0x80 diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h new file mode 100644 index 0000000000..bf2a45ad50 --- /dev/null +++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h @@ -0,0 +1,133 @@ +/* + * This file generates the include file for the assembly + * implementations by abusing the C preprocessor. + * + * Note: Some things are manually defined as they need to + * be mapped to NASM types. + */ + +; +; Automatically generated include file from jsimdcfg.inc.h +; + +#define JPEG_INTERNALS + +#include "../jpeglib.h" +#include "../jconfig.h" +#include "../jmorecfg.h" +#include "jsimd.h" + +; +; -- jpeglib.h +; + +%define _cpp_protection_DCTSIZE DCTSIZE +%define _cpp_protection_DCTSIZE2 DCTSIZE2 + +; +; -- jmorecfg.h +; + +%define _cpp_protection_RGB_RED RGB_RED +%define _cpp_protection_RGB_GREEN RGB_GREEN +%define _cpp_protection_RGB_BLUE RGB_BLUE +%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED +%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN +%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE +%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE + +%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED +%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN +%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE +%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE + +%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED +%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN +%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE +%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE + +%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED +%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN +%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE +%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE + +%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED +%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN +%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE +%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE + +%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED +%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN +%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE +%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE + +%define RGBX_FILLER_0XFF 1 + +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. +; + +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) + +%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE + +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) + +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) + +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) + +; +; -- jdct.h +; + +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) + +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) + +; To maximize parallelism, Type MULTIPLIER is changed to short. +; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) + +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors + +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) + +; +; -- jsimd.h +; + +%define _cpp_protection_JSIMD_NONE JSIMD_NONE +%define _cpp_protection_JSIMD_MMX JSIMD_MMX +%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW +%define _cpp_protection_JSIMD_SSE JSIMD_SSE +%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 +%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2 diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc new file mode 100644 index 0000000000..e8d50b0349 --- /dev/null +++ b/media/libjpeg/simd/nasm/jsimdext.inc @@ -0,0 +1,520 @@ +; +; jsimdext.inc - common declarations +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander. +; Copyright (C) 2018, Matthieu Darbois. +; Copyright (C) 2018, Matthias Räncker. +; +; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 +; +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. + +; ========================================================================== +; System-dependent configurations + +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +; * Microsoft Visual C++ +; * MinGW (Minimalist GNU for Windows) +; * CygWin +; * LCC-Win32 + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use32 class=CODE +%define SEG_CONST .rdata align=32 public use32 class=CONST +%endif + +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +; * Microsoft Visual C++ + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 +%else +%define SEG_TEXT .text align=32 public use64 class=CODE +%define SEG_CONST .rdata align=32 public use64 class=CONST +%endif +%define EXTN(name) name ; foo() -> foo + +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +; * Borland C++ (Win32) + +; -- segment definition -- +; +%define SEG_TEXT _text align=32 public use32 class=CODE +%define SEG_CONST _data align=32 public use32 class=DATA + +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +; * Linux +; * *BSD family Unix using elf format +; * Unix System V, including Solaris x86, UnixWare and SCO Unix + +; mark stack as non-executable +section .note.GNU-stack noalloc noexec nowrite progbits + +; -- segment definition -- +; +%ifdef __x86_64__ +%define SEG_TEXT .text progbits align=32 +%define SEG_CONST .rodata progbits align=32 +%else +%define SEG_TEXT .text progbits alloc exec nowrite align=32 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=32 +%endif + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo + +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +; * Older Linux using a.out format (nasm -f aout -DAOUT ...) +; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC + +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) + +; -- segment definition -- +; +%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why? +%define SEG_CONST .rodata align=32 + +; The generation of position-independent code (PIC) is the default on Darwin. +; +%define PIC +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing + +%else ; ----(Other case)---------------------- + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +%endif ; ---------------------------------------------- + +; ========================================================================== + +; -------------------------------------------------------------------------- +; Common types +; +%ifdef __x86_64__ +%ifnidn __OUTPUT_FORMAT__, elfx32 +%define POINTER qword ; general pointer type +%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) +%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define resp resq +%define dp dq +%define raxp rax +%define rbxp rbx +%define rcxp rcx +%define rdxp rdx +%define rsip rsi +%define rdip rdi +%define rbpp rbp +%define rspp rsp +%define r8p r8 +%define r9p r9 +%define r10p r10 +%define r11p r11 +%define r12p r12 +%define r13p r13 +%define r14p r14 +%define r15p r15 +%endif +%endif +%ifndef raxp +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define resp resd +%define dp dd +; x86_64 ILP32 ABI (x32) +%define raxp eax +%define rbxp ebx +%define rcxp ecx +%define rdxp edx +%define rsip esi +%define rdip edi +%define rbpp ebp +%define rspp esp +%define r8p r8d +%define r9p r9d +%define r10p r10d +%define r11p r11d +%define r12p r12d +%define r13p r13d +%define r14p r14d +%define r15p r15d +%endif + +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT + +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT + +%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT + +; NASM is buggy and doesn't properly handle operand sizes for SSE +; instructions, so for now we have to define XMMWORD as blank. +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT + +%define YMMWORD ; int256 (AVX register) +%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD) +%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT + +; Similar hacks for when we load a dword or MMWORD into an xmm# register +%define XMM_DWORD +%define XMM_MMWORD + +%define SIZEOF_BYTE 1 ; sizeof(byte) +%define SIZEOF_WORD 2 ; sizeof(word) +%define SIZEOF_DWORD 4 ; sizeof(dword) +%define SIZEOF_QWORD 8 ; sizeof(qword) +%define SIZEOF_OWORD 16 ; sizeof(oword) +%define SIZEOF_YWORD 32 ; sizeof(yword) + +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT +%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT + +; -------------------------------------------------------------------------- +; External Symbol Name +; +%ifndef EXTN +%define EXTN(name) _ %+ name ; foo() -> _foo +%endif + +; -------------------------------------------------------------------------- +; Hidden symbols +; +%ifdef ELF ; ----(nasm -felf[64] -DELF ...)-------- +%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden +%define GLOBAL_DATA(name) global EXTN(name):data hidden +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +%ifdef __YASM_VER__ +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%else +%if __NASM_VERSION_ID__ >= 0x020E0000 +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%endif +%endif +%endif + +%ifndef GLOBAL_FUNCTION +%define GLOBAL_FUNCTION(name) global EXTN(name) +%endif +%ifndef GLOBAL_DATA +%define GLOBAL_DATA(name) global EXTN(name) +%endif + +; -------------------------------------------------------------------------- +; Macros for position-independent code (PIC) support +; +%ifndef GOT_SYMBOL +%undef PIC +%endif + +%ifdef PIC ; ------------------------------------------- + +%ifidn GOT_SYMBOL, _MACHO_PIC_ ; -------------------- + +; At present, nasm doesn't seem to support PIC generation for Mach-O. +; The PIC support code below is a little tricky. + + SECTION SEG_CONST +const_base: + +%define GOTOFF(got, sym) (got) + (sym) - const_base + +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. + call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust +%%geteip: + mov ecx, POINTER [esp] + ret +%%adjust: + push ebp + xor ebp, ebp ; ebp = 0 +%ifidni %1, ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D, 0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: +%else ; (%1 != ebx) + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D, 0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: + mov %1, ecx +%endif ; (%1 == ebx) + pop ebp +%endmacro + +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- + +%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff + +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done +%%geteip: + mov %1, POINTER [esp] + ret +%%done: +%endmacro + +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- + +%imacro pushpic 1.nolist + push %1 +%endmacro +%imacro poppic 1.nolist + pop %1 +%endmacro +%imacro movpic 2.nolist + mov %1, %2 +%endmacro + +%else ; !PIC ----------------------------------------- + +%define GOTOFF(got, sym) (sym) + +%imacro get_GOT 1.nolist +%endmacro +%imacro pushpic 1.nolist +%endmacro +%imacro poppic 1.nolist +%endmacro +%imacro movpic 2.nolist +%endmacro + +%endif ; PIC ----------------------------------------- + +; -------------------------------------------------------------------------- +; Align the next instruction on {2,4,8,16,..}-byte boundary. +; ".balign n,,m" in GNU as +; +%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) +%define FILLB(b, n) (($$-(b)) & ((n)-1)) + +%imacro alignx 1-2.nolist 0xFFFF +%%bs: \ + times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \ + db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \ + db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \ + db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \ + db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \ + db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \ + db 0x8B, 0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \ + db 0x90 ; nop +%endmacro + +; Align the next data on {2,4,8,16,..}-byte boundary. +; +%imacro alignz 1.nolist + align %1, db 0 ; filling zeros +%endmacro + +%ifdef __x86_64__ + +%ifdef WIN64 + +%imacro collect_args 1 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 + mov r10, rcx +%if %1 > 1 + mov r11, rdx +%endif +%if %1 > 2 + push r12 + mov r12, r8 +%endif +%if %1 > 3 + push r13 + mov r13, r9 +%endif +%if %1 > 4 + push r14 + mov r14, [rax+48] +%endif +%if %1 > 5 + push r15 + mov r15, [rax+56] +%endif + push rsi + push rdi +%endmacro + +%imacro uncollect_args 1 + pop rdi + pop rsi +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD +%endmacro + +%imacro push_xmm 1 + sub rsp, %1 * SIZEOF_XMMWORD + movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8 +%if %1 > 1 + movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9 +%endif +%if %1 > 2 + movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10 +%endif +%if %1 > 3 + movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11 +%endif +%endmacro + +%imacro pop_xmm 1 + movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD] +%if %1 > 1 + movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD] +%endif +%if %1 > 2 + movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD] +%endif +%if %1 > 3 + movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD] +%endif + add rsp, %1 * SIZEOF_XMMWORD +%endmacro + +%else + +%imacro collect_args 1 + push r10 + mov r10, rdi +%if %1 > 1 + push r11 + mov r11, rsi +%endif +%if %1 > 2 + push r12 + mov r12, rdx +%endif +%if %1 > 3 + push r13 + mov r13, rcx +%endif +%if %1 > 4 + push r14 + mov r14, r8 +%endif +%if %1 > 5 + push r15 + mov r15, r9 +%endif +%endmacro + +%imacro uncollect_args 1 +%if %1 > 5 + pop r15 +%endif +%if %1 > 4 + pop r14 +%endif +%if %1 > 3 + pop r13 +%endif +%if %1 > 2 + pop r12 +%endif +%if %1 > 1 + pop r11 +%endif + pop r10 +%endmacro + +%imacro push_xmm 1 +%endmacro + +%imacro pop_xmm 1 +%endmacro + +%endif + +%endif + +; -------------------------------------------------------------------------- +; Defines picked up from the C headers +; +%include "jsimdcfg.inc" + +; -------------------------------------------------------------------------- |