summaryrefslogtreecommitdiffstats
path: root/media/libjpeg/simd/nasm
diff options
context:
space:
mode:
Diffstat (limited to 'media/libjpeg/simd/nasm')
-rw-r--r--media/libjpeg/simd/nasm/jcolsamp.inc135
-rw-r--r--media/libjpeg/simd/nasm/jdct.inc31
-rw-r--r--media/libjpeg/simd/nasm/jsimdcfg.inc93
-rw-r--r--media/libjpeg/simd/nasm/jsimdcfg.inc.h133
-rw-r--r--media/libjpeg/simd/nasm/jsimdext.inc520
5 files changed, 912 insertions, 0 deletions
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%define ymmA ymm0
+%define ymmB ymm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%define ymmA ymm2
+%define ymmB ymm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%define ymmA ymm4
+%define ymmB ymm5
+%else
+%define mmA mm6
+%define mmB mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%define ymmA ymm6
+%define ymmB ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%define ymmC ymm0
+%define ymmD ymm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%define ymmC ymm2
+%define ymmD ymm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%define ymmC ymm4
+%define ymmD ymm5
+%else
+%define mmC mm6
+%define mmD mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%define ymmC ymm6
+%define ymmD ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%define ymmE ymm0
+%define ymmF ymm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%define ymmE ymm2
+%define ymmF ymm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%define ymmE ymm4
+%define ymmF ymm5
+%else
+%define mmE mm6
+%define mmF mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%define ymmE ymm6
+%define ymmF ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%define ymmG ymm0
+%define ymmH ymm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%define ymmG ymm2
+%define ymmH ymm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%define ymmG ymm4
+%define ymmH ymm5
+%else
+%define mmG mm6
+%define mmH mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%define ymmG ymm6
+%define ymmH ymm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
new file mode 100644
index 0000000000..9192f66f0c
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -0,0 +1,31 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required. We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
+
+%define ROW(n, b, s) ((b) + (n) * (s))
+%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE)
+
+%define DWBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
new file mode 100644
index 0000000000..667024a5f9
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -0,0 +1,93 @@
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+;
+; -- jpeglib.h
+;
+%define DCTSIZE 8
+%define DCTSIZE2 64
+;
+; -- jmorecfg.h
+;
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define EXT_RGB_RED 0
+%define EXT_RGB_GREEN 1
+%define EXT_RGB_BLUE 2
+%define EXT_RGB_PIXELSIZE 3
+%define EXT_RGBX_RED 0
+%define EXT_RGBX_GREEN 1
+%define EXT_RGBX_BLUE 2
+%define EXT_RGBX_PIXELSIZE 4
+%define EXT_BGR_RED 2
+%define EXT_BGR_GREEN 1
+%define EXT_BGR_BLUE 0
+%define EXT_BGR_PIXELSIZE 3
+%define EXT_BGRX_RED 2
+%define EXT_BGRX_GREEN 1
+%define EXT_BGRX_BLUE 0
+%define EXT_BGRX_PIXELSIZE 4
+%define EXT_XBGR_RED 3
+%define EXT_XBGR_GREEN 2
+%define EXT_XBGR_BLUE 1
+%define EXT_XBGR_PIXELSIZE 4
+%define EXT_XRGB_RED 1
+%define EXT_XRGB_GREEN 2
+%define EXT_XRGB_BLUE 3
+%define EXT_XRGB_PIXELSIZE 4
+%define RGBX_FILLER_0XFF 1
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define CENTERJSAMPLE 128
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+;
+; -- jdct.h
+;
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+%define float FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
+; To maximize parallelism, Type short is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+;
+; -- jsimd.h
+;
+%define JSIMD_NONE 0x00
+%define JSIMD_MMX 0x01
+%define JSIMD_3DNOW 0x02
+%define JSIMD_SSE 0x04
+%define JSIMD_SSE2 0x08
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF 1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+; System-dependent configurations
+
+%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use32 class=CODE
+%define SEG_CONST .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use64 class=CODE
+%define SEG_CONST .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT _text align=32 public use32 class=CODE
+%define SEG_CONST _data align=32 public use32 class=DATA
+
+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT .text progbits align=32
+%define SEG_CONST .rodata progbits align=32
+%else
+%define SEG_TEXT .text progbits alloc exec nowrite align=32
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
+%define SEG_CONST .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
+
+%else ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+%endif ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+; Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resq
+%define dp dq
+%define raxp rax
+%define rbxp rbx
+%define rcxp rcx
+%define rdxp rdx
+%define rsip rsi
+%define rdip rdi
+%define rbpp rbp
+%define rspp rsp
+%define r8p r8
+%define r9p r9
+%define r10p r10
+%define r11p r11
+%define r12p r12
+%define r13p r13
+%define r14p r14
+%define r15p r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resd
+%define dp dd
+; x86_64 ILP32 ABI (x32)
+%define raxp eax
+%define rbxp ebx
+%define rcxp ecx
+%define rdxp edx
+%define rsip esi
+%define rdip edi
+%define rbpp ebp
+%define rspp esp
+%define r8p r8d
+%define r9p r9d
+%define r10p r10d
+%define r11p r11d
+%define r12p r12d
+%define r13p r13d
+%define r14p r14d
+%define r15p r15d
+%endif
+
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD ; int256 (AVX register)
+%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
+%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE 1 ; sizeof(byte)
+%define SIZEOF_WORD 2 ; sizeof(word)
+%define SIZEOF_DWORD 4 ; sizeof(dword)
+%define SIZEOF_QWORD 8 ; sizeof(qword)
+%define SIZEOF_OWORD 16 ; sizeof(oword)
+%define SIZEOF_YWORD 32 ; sizeof(yword)
+
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+; External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name) _ %+ name ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+; Hidden symbols
+;
+%ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
+%define GLOBAL_DATA(name) global EXTN(name):data hidden
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name) global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name) global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+; Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+ SECTION SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym) (got) + (sym) - const_base
+
+%imacro get_GOT 1
+ ; NOTE: this macro destroys ecx resister.
+ call %%geteip
+ add ecx, byte (%%ref - $)
+ jmp short %%adjust
+%%geteip:
+ mov ecx, POINTER [esp]
+ ret
+%%adjust:
+ push ebp
+ xor ebp, ebp ; ebp = 0
+%ifidni %1, ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D, 0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+%else ; (%1 != ebx)
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D, 0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+ mov %1, ecx
+%endif ; (%1 == ebx)
+ pop ebp
+%endmacro
+
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
+%%geteip:
+ mov %1, POINTER [esp]
+ ret
+%%done:
+%endmacro
+
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+ push %1
+%endmacro
+%imacro poppic 1.nolist
+ pop %1
+%endmacro
+%imacro movpic 2.nolist
+ mov %1, %2
+%endmacro
+
+%else ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym) (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic 1.nolist
+%endmacro
+%imacro movpic 2.nolist
+%endmacro
+
+%endif ; PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+; Align the next instruction on {2,4,8,16,..}-byte boundary.
+; ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n) (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+ db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+ db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+ db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+ db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+ db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+ db 0x8B, 0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+ db 0x90 ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+ align %1, db 0 ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm6
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm7
+ mov r10, rcx
+%if %1 > 1
+ mov r11, rdx
+%endif
+%if %1 > 2
+ push r12
+ mov r12, r8
+%endif
+%if %1 > 3
+ push r13
+ mov r13, r9
+%endif
+%if %1 > 4
+ push r14
+ mov r14, [rax+48]
+%endif
+%if %1 > 5
+ push r15
+ mov r15, [rax+56]
+%endif
+ push rsi
+ push rdi
+%endmacro
+
+%imacro uncollect_args 1
+ pop rdi
+ pop rsi
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+ movaps xmm7, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+ sub rsp, %1 * SIZEOF_XMMWORD
+ movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+ movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+ movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+ movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+ movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+ movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+ movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+ movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+ add rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+ push r10
+ mov r10, rdi
+%if %1 > 1
+ push r11
+ mov r11, rsi
+%endif
+%if %1 > 2
+ push r12
+ mov r12, rdx
+%endif
+%if %1 > 3
+ push r13
+ mov r13, rcx
+%endif
+%if %1 > 4
+ push r14
+ mov r14, r8
+%endif
+%if %1 > 5
+ push r15
+ mov r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+%if %1 > 1
+ pop r11
+%endif
+ pop r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+; Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------