Diffstat (limited to 'src/VBox/VMM/VMMAll/IEMAllAImpl.asm')
-rw-r--r-- | src/VBox/VMM/VMMAll/IEMAllAImpl.asm | 3024 |
1 file changed, 3024 insertions, 0 deletions
diff --git a/src/VBox/VMM/VMMAll/IEMAllAImpl.asm b/src/VBox/VMM/VMMAll/IEMAllAImpl.asm
new file mode 100644
index 00000000..fc4ed29a
--- /dev/null
+++ b/src/VBox/VMM/VMMAll/IEMAllAImpl.asm
@@ -0,0 +1,3024 @@
+; $Id: IEMAllAImpl.asm $
+;; @file
+; IEM - Instruction Implementation in Assembly.
+;
+
+;
+; Copyright (C) 2011-2020 Oracle Corporation
+;
+; This file is part of VirtualBox Open Source Edition (OSE), as
+; available from http://www.virtualbox.org. This file is free software;
+; you can redistribute it and/or modify it under the terms of the GNU
+; General Public License (GPL) as published by the Free Software
+; Foundation, in version 2 as it comes in the "COPYING" file of the
+; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+;
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Header Files                                                                  ;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "VBox/asmdefs.mac"
+%include "VBox/err.mac"
+%include "iprt/x86.mac"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Defined Constants And Macros                                                  ;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;
+; RET XX / RET wrapper for fastcall.
+;
+%macro RET_FASTCALL 1
+%ifdef RT_ARCH_X86
+ %ifdef RT_OS_WINDOWS
+    ret %1
+ %else
+    ret
+ %endif
+%else
+    ret
+%endif
+%endmacro
+
+;;
+; NAME for fastcall functions.
+;
+;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
+;        escaping (or whatever the dollar is good for here).  Thus the ugly
+;        prefix argument.
+;
+%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
+%ifdef RT_ARCH_X86
+ %ifdef RT_OS_WINDOWS
+  %undef NAME_FASTCALL
+  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
+ %endif
+%endif
+
+;;
+; BEGINPROC for fastcall functions.
+;
+; @param 1 The function name (C).
+; @param 2 The argument size on x86.
+;
+%macro BEGINPROC_FASTCALL 2
+ %ifdef ASM_FORMAT_PE
+  export %1=NAME_FASTCALL(%1,%2,$@)
+ %endif
+ %ifdef __NASM__
+  %ifdef ASM_FORMAT_OMF
+   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
+  %endif
+ %endif
+ %ifndef ASM_FORMAT_BIN
+  global NAME_FASTCALL(%1,%2,$@)
+ %endif
+NAME_FASTCALL(%1,%2,@):
+%endmacro
+
+
+;
+; We employ some macro assembly here to hide the calling convention differences.
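+;
+; For orientation: viewed from C, a worker generated further down might be
+; declared and used roughly like this (an illustrative sketch only, with
+; made-up caller variables; the real declarations live in the IEM headers):
+;
+;     void iemAImpl_add_u32(uint32_t *pu32Dst, uint32_t u32Src, uint32_t *pEFlags);
+;
+;     uint32_t fEFlags = uGuestEFlags;
+;     iemAImpl_add_u32(&uDst, uSrc, &fEFlags);   /* A0, A1, A2 below */
+;     uGuestEFlags = fEFlags;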
+; +%ifdef RT_ARCH_AMD64 + %macro PROLOGUE_1_ARGS 0 + %endmacro + %macro EPILOGUE_1_ARGS 0 + ret + %endmacro + %macro EPILOGUE_1_ARGS_EX 0 + ret + %endmacro + + %macro PROLOGUE_2_ARGS 0 + %endmacro + %macro EPILOGUE_2_ARGS 0 + ret + %endmacro + %macro EPILOGUE_2_ARGS_EX 1 + ret + %endmacro + + %macro PROLOGUE_3_ARGS 0 + %endmacro + %macro EPILOGUE_3_ARGS 0 + ret + %endmacro + %macro EPILOGUE_3_ARGS_EX 1 + ret + %endmacro + + %macro PROLOGUE_4_ARGS 0 + %endmacro + %macro EPILOGUE_4_ARGS 0 + ret + %endmacro + %macro EPILOGUE_4_ARGS_EX 1 + ret + %endmacro + + %ifdef ASM_CALL64_GCC + %define A0 rdi + %define A0_32 edi + %define A0_16 di + %define A0_8 dil + + %define A1 rsi + %define A1_32 esi + %define A1_16 si + %define A1_8 sil + + %define A2 rdx + %define A2_32 edx + %define A2_16 dx + %define A2_8 dl + + %define A3 rcx + %define A3_32 ecx + %define A3_16 cx + %endif + + %ifdef ASM_CALL64_MSC + %define A0 rcx + %define A0_32 ecx + %define A0_16 cx + %define A0_8 cl + + %define A1 rdx + %define A1_32 edx + %define A1_16 dx + %define A1_8 dl + + %define A2 r8 + %define A2_32 r8d + %define A2_16 r8w + %define A2_8 r8b + + %define A3 r9 + %define A3_32 r9d + %define A3_16 r9w + %endif + + %define T0 rax + %define T0_32 eax + %define T0_16 ax + %define T0_8 al + + %define T1 r11 + %define T1_32 r11d + %define T1_16 r11w + %define T1_8 r11b + +%else + ; x86 + %macro PROLOGUE_1_ARGS 0 + push edi + %endmacro + %macro EPILOGUE_1_ARGS 0 + pop edi + ret 0 + %endmacro + %macro EPILOGUE_1_ARGS_EX 1 + pop edi + ret %1 + %endmacro + + %macro PROLOGUE_2_ARGS 0 + push edi + %endmacro + %macro EPILOGUE_2_ARGS 0 + pop edi + ret 0 + %endmacro + %macro EPILOGUE_2_ARGS_EX 1 + pop edi + ret %1 + %endmacro + + %macro PROLOGUE_3_ARGS 0 + push ebx + mov ebx, [esp + 4 + 4] + push edi + %endmacro + %macro EPILOGUE_3_ARGS_EX 1 + %if (%1) < 4 + %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)." + %endif + pop edi + pop ebx + ret %1 + %endmacro + %macro EPILOGUE_3_ARGS 0 + EPILOGUE_3_ARGS_EX 4 + %endmacro + + %macro PROLOGUE_4_ARGS 0 + push ebx + push edi + push esi + mov ebx, [esp + 12 + 4 + 0] + mov esi, [esp + 12 + 4 + 4] + %endmacro + %macro EPILOGUE_4_ARGS_EX 1 + %if (%1) < 8 + %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)." + %endif + pop esi + pop edi + pop ebx + ret %1 + %endmacro + %macro EPILOGUE_4_ARGS 0 + EPILOGUE_4_ARGS_EX 8 + %endmacro + + %define A0 ecx + %define A0_32 ecx + %define A0_16 cx + %define A0_8 cl + + %define A1 edx + %define A1_32 edx + %define A1_16 dx + %define A1_8 dl + + %define A2 ebx + %define A2_32 ebx + %define A2_16 bx + %define A2_8 bl + + %define A3 esi + %define A3_32 esi + %define A3_16 si + + %define T0 eax + %define T0_32 eax + %define T0_16 ax + %define T0_8 al + + %define T1 edi + %define T1_32 edi + %define T1_16 di +%endif + + +;; +; Load the relevant flags from [%1] if there are undefined flags (%3). +; +; @remarks Clobbers T0, stack. Changes EFLAGS. +; @param A2 The register pointing to the flags. +; @param 1 The parameter (A0..A3) pointing to the eflags. +; @param 2 The set of modified flags. +; @param 3 The set of undefined flags. +; +%macro IEM_MAYBE_LOAD_FLAGS 3 + ;%if (%3) != 0 + pushf ; store current flags + mov T0_32, [%1] ; load the guest flags + and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags + and T0_32, (%2 | %3) ; select the modified and undefined flags. + or [xSP], T0 ; merge guest flags with host flags. + popf ; load the mixed flags. 
+ ;%endif
+%endmacro
+
+;;
+; Update the flags.
+;
+; @remarks Clobbers T0, T1, stack.
+; @param 1 The register pointing to the EFLAGS.
+; @param 2 The mask of modified flags to save.
+; @param 3 The mask of undefined flags to (maybe) save.
+;
+%macro IEM_SAVE_FLAGS 3
+ %if (%2 | %3) != 0
+    pushf
+    pop  T1
+    mov  T0_32, [%1]          ; flags
+    and  T0_32, ~(%2 | %3)    ; clear the modified & undefined flags.
+    and  T1_32, (%2 | %3)     ; select the modified and undefined flags.
+    or   T0_32, T1_32         ; combine the flags.
+    mov  [%1], T0_32          ; save the flags.
+ %endif
+%endmacro
+
+
+;;
+; Macro for implementing a binary operator.
+;
+; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
+; variants, except on 32-bit systems where the 64-bit accesses require hand
+; coding.
+;
+; All the functions take a pointer to the destination memory operand in A0,
+; the source register operand in A1 and a pointer to eflags in A2.
+;
+; @param 1 The instruction mnemonic.
+; @param 2 Non-zero if there should be a locked version.
+; @param 3 The modified flags.
+; @param 4 The undefined flags.
+;
+%macro IEMIMPL_BIN_OP 4
+BEGINCODE
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    %1   byte [A0], A1_8
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u8
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    %1   word [A0], A1_16
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u16
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    %1   dword [A0], A1_32
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u32
+
+ %ifdef RT_ARCH_AMD64
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    %1   qword [A0], A1
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS_EX 8
+ENDPROC iemAImpl_ %+ %1 %+ _u64
+ %endif ; RT_ARCH_AMD64
+
+ %if %2 != 0 ; locked versions requested?
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    lock %1 byte [A0], A1_8
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    lock %1 word [A0], A1_16
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    lock %1 dword [A0], A1_32
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
+
+ %ifdef RT_ARCH_AMD64
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
+    PROLOGUE_3_ARGS
+    IEM_MAYBE_LOAD_FLAGS A2, %3, %4
+    lock %1 qword [A0], A1
+    IEM_SAVE_FLAGS A2, %3, %4
+    EPILOGUE_3_ARGS_EX 8
+ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
+ %endif ; RT_ARCH_AMD64
+ %endif ; locked
+%endmacro
+
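+; In C terms, the IEM_MAYBE_LOAD_FLAGS / IEM_SAVE_FLAGS pair used by the
+; macro above amounts to the following (illustrative sketch only, not from
+; the original sources):
+;
+;     uint32_t const fMask = fModifiedFlags | fUndefinedFlags;
+;     /* load: mix the interesting guest flags into the host EFLAGS: */
+;     uHostEfl = (uHostEfl & ~fMask) | (*pEFlags & fMask);
+;     /* ... the actual instruction executes here ... */
+;     /* save: pick the resulting flags back out into the guest copy: */
+;     *pEFlags = (*pEFlags & ~fMask) | (uHostEfl & fMask);
+
+; instr,lock,modified-flags,undefined-flags.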
+IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF +IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF +IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF +IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF + + +;; +; Macro for implementing a bit operator. +; +; This will generate code for the 16, 32 and 64 bit accesses with locked +; variants, except on 32-bit system where the 64-bit accesses requires hand +; coding. +; +; All the functions takes a pointer to the destination memory operand in A0, +; the source register operand in A1 and a pointer to eflags in A2. +; +; @param 1 The instruction mnemonic. +; @param 2 Non-zero if there should be a locked version. +; @param 3 The modified flags. +; @param 4 The undefined flags. +; +%macro IEMIMPL_BIT_OP 4 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + %1 word [A0], A1_16 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + %1 dword [A0], A1_32 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + %1 qword [A0], A1 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS_EX 8 +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; RT_ARCH_AMD64 + + %if %2 != 0 ; locked versions requested? + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + lock %1 word [A0], A1_16 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16_locked + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + lock %1 dword [A0], A1_32 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32_locked + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %3, %4 + lock %1 qword [A0], A1 + IEM_SAVE_FLAGS A2, %3, %4 + EPILOGUE_3_ARGS_EX 8 +ENDPROC iemAImpl_ %+ %1 %+ _u64_locked + %endif ; RT_ARCH_AMD64 + %endif ; locked +%endmacro +IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) +IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) +IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) +IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + +;; +; Macro for implementing a bit search operator. +; +; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit +; system where the 64-bit accesses requires hand coding. 
+; +; All the functions takes a pointer to the destination memory operand in A0, +; the source register operand in A1 and a pointer to eflags in A2. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. +; +%macro IEMIMPL_BIT_OP 3 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %1 T0_16, A1_16 + jz .unchanged_dst + mov [A0], T0_16 +.unchanged_dst: + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %1 T0_32, A1_32 + jz .unchanged_dst + mov [A0], T0_32 +.unchanged_dst: + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %1 T0, A1 + jz .unchanged_dst + mov [A0], T0 +.unchanged_dst: + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS_EX 8 +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; RT_ARCH_AMD64 +%endmacro +IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF) +IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF) + + +; +; IMUL is also a similar but yet different case (no lock, no mem dst). +; The rDX:rAX variant of imul is handled together with mul further down. +; +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + imul A1_16, word [A0] + mov [A0], A1_16 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + EPILOGUE_3_ARGS +ENDPROC iemAImpl_imul_two_u16 + +BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + imul A1_32, dword [A0] + mov [A0], A1_32 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + EPILOGUE_3_ARGS +ENDPROC iemAImpl_imul_two_u32 + +%ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + imul A1, qword [A0] + mov [A0], A1 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + EPILOGUE_3_ARGS_EX 8 +ENDPROC iemAImpl_imul_two_u64 +%endif ; RT_ARCH_AMD64 + + +; +; XCHG for memory operands. This implies locking. No flag changes. +; +; Each function takes two arguments, first the pointer to the memory, +; then the pointer to the register. They all return void. +; +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_xchg_u8, 8 + PROLOGUE_2_ARGS + mov T0_8, [A1] + xchg [A0], T0_8 + mov [A1], T0_8 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_xchg_u8 + +BEGINPROC_FASTCALL iemAImpl_xchg_u16, 8 + PROLOGUE_2_ARGS + mov T0_16, [A1] + xchg [A0], T0_16 + mov [A1], T0_16 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_xchg_u16 + +BEGINPROC_FASTCALL iemAImpl_xchg_u32, 8 + PROLOGUE_2_ARGS + mov T0_32, [A1] + xchg [A0], T0_32 + mov [A1], T0_32 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_xchg_u32 + +%ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_xchg_u64, 8 + PROLOGUE_2_ARGS + mov T0, [A1] + xchg [A0], T0 + mov [A1], T0 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_xchg_u64 +%endif + + +; +; XADD for memory operands. 
+; +; Each function takes three arguments, first the pointer to the +; memory/register, then the pointer to the register, and finally a pointer to +; eflags. They all return void. +; +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_8, [A1] + xadd [A0], T0_8 + mov [A1], T0_8 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u8 + +BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_16, [A1] + xadd [A0], T0_16 + mov [A1], T0_16 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u16 + +BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_32, [A1] + xadd [A0], T0_32 + mov [A1], T0_32 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u32 + +%ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0, [A1] + xadd [A0], T0 + mov [A1], T0 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u64 +%endif ; RT_ARCH_AMD64 + +BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_8, [A1] + lock xadd [A0], T0_8 + mov [A1], T0_8 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u8_locked + +BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_16, [A1] + lock xadd [A0], T0_16 + mov [A1], T0_16 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u16_locked + +BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0_32, [A1] + lock xadd [A0], T0_32 + mov [A1], T0_32 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u32_locked + +%ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + mov T0, [A1] + lock xadd [A0], T0 + mov [A1], T0 + IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_xadd_u64_locked +%endif ; RT_ARCH_AMD64 + + +; +; CMPXCHG8B. +; +; These are tricky register wise, so the code is duplicated for each calling +; convention. +; +; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to! 
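+;
+; Architecturally, CMPXCHG8B does the following (illustrative C sketch, not
+; from the original sources, using the names of the prototype below):
+;
+;     if (*pu64Dst == pu64EaxEdx->u)    /* compare EDX:EAX with the qword   */
+;     {
+;         *pu64Dst = pu64EbxEcx->u;     /* equal: store ECX:EBX, ZF := 1    */
+;         *pEFlags |= X86_EFL_ZF;
+;     }
+;     else
+;     {
+;         pu64EaxEdx->u = *pu64Dst;     /* unequal: load old value, ZF := 0 */
+;         *pEFlags &= ~X86_EFL_ZF;
+;     }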
+;
+; C-proto:
+; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
+;                                             uint32_t *pEFlags));
+;
+; Note! Identical to iemAImpl_cmpxchg16b.
+;
+BEGINCODE
+BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
+%ifdef RT_ARCH_AMD64
+ %ifdef ASM_CALL64_MSC
+    push rbx
+
+    mov  r11, rdx               ; pu64EaxEdx (is also T1)
+    mov  r10, rcx               ; pu64Dst
+
+    mov  ebx, [r8]
+    mov  ecx, [r8 + 4]
+    IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
+    mov  eax, [r11]
+    mov  edx, [r11 + 4]
+
+    lock cmpxchg8b [r10]
+
+    mov  [r11], eax
+    mov  [r11 + 4], edx
+    IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
+
+    pop  rbx
+    ret
+ %else
+    push rbx
+
+    mov  r10, rcx               ; pEFlags
+    mov  r11, rdx               ; pu64EbxEcx (is also T1)
+
+    mov  ebx, [r11]
+    mov  ecx, [r11 + 4]
+    IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
+    mov  eax, [rsi]
+    mov  edx, [rsi + 4]
+
+    lock cmpxchg8b [rdi]
+
+    mov  [rsi], eax
+    mov  [rsi + 4], edx
+    IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
+
+    pop  rbx
+    ret
+
+ %endif
+%else
+    push esi
+    push edi
+    push ebx
+    push ebp
+
+    mov  edi, ecx               ; pu64Dst
+    mov  esi, edx               ; pu64EaxEdx
+    mov  ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
+    mov  ebp, [esp + 16 + 4 + 4] ; pEFlags
+
+    mov  ebx, [ecx]
+    mov  ecx, [ecx + 4]
+    IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
+    mov  eax, [esi]
+    mov  edx, [esi + 4]
+
+    lock cmpxchg8b [edi]
+
+    mov  [esi], eax
+    mov  [esi + 4], edx
+    IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
+
+    pop  ebp
+    pop  ebx
+    pop  edi
+    pop  esi
+    ret  8
+%endif
+ENDPROC iemAImpl_cmpxchg8b
+
+BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
+    ; Lazy bird always lock prefixes cmpxchg8b.
+    jmp  NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
+ENDPROC iemAImpl_cmpxchg8b_locked
+
+%ifdef RT_ARCH_AMD64
+
+;
+; CMPXCHG16B.
+;
+; These are tricky register-wise, so the code is duplicated for each calling
+; convention.
+;
+; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
+;
+; C-proto:
+; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
+;                                              uint32_t *pEFlags));
+;
+; Note! Identical to iemAImpl_cmpxchg8b.
+;
+BEGINCODE
+BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
+ %ifdef ASM_CALL64_MSC
+    push rbx
+
+    mov  r11, rdx               ; pu64RaxRdx (is also T1)
+    mov  r10, rcx               ; pu64Dst
+
+    mov  rbx, [r8]
+    mov  rcx, [r8 + 8]
+    IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
+    mov  rax, [r11]
+    mov  rdx, [r11 + 8]
+
+    lock cmpxchg16b [r10]
+
+    mov  [r11], rax
+    mov  [r11 + 8], rdx
+    IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
+
+    pop  rbx
+    ret
+ %else
+    push rbx
+
+    mov  r10, rcx               ; pEFlags
+    mov  r11, rdx               ; pu64RbxRcx (is also T1)
+
+    mov  rbx, [r11]
+    mov  rcx, [r11 + 8]
+    IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
+    mov  rax, [rsi]
+    mov  rdx, [rsi + 8]
+
+    lock cmpxchg16b [rdi]
+
+    mov  [rsi], rax
+    mov  [rsi + 8], rdx
+    IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
+
+    pop  rbx
+    ret
+
+ %endif
+ENDPROC iemAImpl_cmpxchg16b
+
+BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
+    ; Lazy bird always lock prefixes cmpxchg16b.
+    jmp  NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
+ENDPROC iemAImpl_cmpxchg16b_locked
+
+%endif ; RT_ARCH_AMD64
+
+
+;
+; CMPXCHG.
+;
+; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
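+;
+; The architectural semantics, in C pseudo-code (sketch only; puEax is used
+; as a pointer to the accumulator by the implementation):
+;
+;     if (*puXDst == *puEax)
+;     {
+;         *puXDst = uReg;       /* equal: store the new value, ZF := 1     */
+;     }
+;     else
+;     {
+;         *puEax = *puXDst;     /* unequal: old value -> accumulator       */
+;     }
+;     /* OF, SF, ZF, AF, PF and CF are set as by comparing the accumulator
+;        with the destination. */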
+; +; C-proto: +; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags)); +; +BEGINCODE +%macro IEMIMPL_CMPXCHG 2 +BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax) + mov al, [A1] + %1 cmpxchg [A0], A2_8 + mov [A1], al + IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi) + EPILOGUE_4_ARGS +ENDPROC iemAImpl_cmpxchg_u8 %+ %2 + +BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax) + mov ax, [A1] + %1 cmpxchg [A0], A2_16 + mov [A1], ax + IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi) + EPILOGUE_4_ARGS +ENDPROC iemAImpl_cmpxchg_u16 %+ %2 + +BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax) + mov eax, [A1] + %1 cmpxchg [A0], A2_32 + mov [A1], eax + IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi) + EPILOGUE_4_ARGS +ENDPROC iemAImpl_cmpxchg_u32 %+ %2 + +BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16 +%ifdef RT_ARCH_AMD64 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax) + mov rax, [A1] + %1 cmpxchg [A0], A2 + mov [A1], rax + IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi) + EPILOGUE_4_ARGS +%else + ; + ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b. + ; + push esi + push edi + push ebx + push ebp + + mov edi, ecx ; pu64Dst + mov esi, edx ; pu64Rax + mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts! + mov ebp, [esp + 16 + 4 + 4] ; pEFlags + + mov ebx, [ecx] + mov ecx, [ecx + 4] + IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax) + mov eax, [esi] + mov edx, [esi + 4] + + lock cmpxchg8b [edi] + + ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that. + jz .cmpxchg8b_not_equal + cmp eax, eax ; just set the other flags. +.store: + mov [esi], eax + mov [esi + 4], edx + IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi) + + pop ebp + pop ebx + pop edi + pop esi + ret 8 + +.cmpxchg8b_not_equal: + cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation + jne .store + cmp [esi], eax + jmp .store + +%endif +ENDPROC iemAImpl_cmpxchg_u64 %+ %2 +%endmacro ; IEMIMPL_CMPXCHG + +IEMIMPL_CMPXCHG , , +IEMIMPL_CMPXCHG lock, _locked + +;; +; Macro for implementing a unary operator. +; +; This will generate code for the 8, 16, 32 and 64 bit accesses with locked +; variants, except on 32-bit system where the 64-bit accesses requires hand +; coding. +; +; All the functions takes a pointer to the destination memory operand in A0, +; the source register operand in A1 and a pointer to eflags in A2. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. 
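+;
+; For example, a hypothetical C call of the 'neg' worker generated below
+; (sketch only; note that the unary workers take just the operand pointer
+; and the eflags pointer):
+;
+;     uint32_t uVal = 42, fEFlags = 0;
+;     iemAImpl_neg_u32(&uVal, &fEFlags);   /* uVal becomes 0xffffffd6 */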
+; +%macro IEMIMPL_UNARY_OP 3 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + %1 byte [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u8 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + lock %1 byte [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u8_locked + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + %1 word [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + lock %1 word [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16_locked + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + %1 dword [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + lock %1 dword [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32_locked + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + %1 qword [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u64 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8 + PROLOGUE_2_ARGS + IEM_MAYBE_LOAD_FLAGS A1, %2, %3 + lock %1 qword [A0] + IEM_SAVE_FLAGS A1, %2, %3 + EPILOGUE_2_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u64_locked + %endif ; RT_ARCH_AMD64 + +%endmacro + +IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0 +IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0 +IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_UNARY_OP not, 0, 0 + + +;; +; Macro for implementing memory fence operation. +; +; No return value, no operands or anything. +; +; @param 1 The instruction. +; +%macro IEMIMPL_MEM_FENCE 1 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0 + %1 + ret +ENDPROC iemAImpl_ %+ %1 +%endmacro + +IEMIMPL_MEM_FENCE lfence +IEMIMPL_MEM_FENCE sfence +IEMIMPL_MEM_FENCE mfence + +;; +; Alternative for non-SSE2 host. +; +BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0 + push xAX + xchg xAX, [xSP] + add xSP, xCB + ret +ENDPROC iemAImpl_alt_mem_fence + + + +;; +; Macro for implementing a shift operation. +; +; This will generate code for the 8, 16, 32 and 64 bit accesses, except on +; 32-bit system where the 64-bit accesses requires hand coding. +; +; All the functions takes a pointer to the destination memory operand in A0, +; the shift count in A1 and a pointer to eflags in A2. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. +; +; Makes ASSUMPTIONS about A0, A1 and A2 assignments. 
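+;
+; For example, a hypothetical C call of the 'shl' worker generated below
+; (sketch only, not from the original sources):
+;
+;     uint32_t uVal = 1, fEFlags = 0;
+;     iemAImpl_shl_u32(&uVal, 4 /*cShift*/, &fEFlags);   /* uVal becomes 16 */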
+; +%macro IEMIMPL_SHIFT_OP 3 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %ifdef ASM_CALL64_GCC + mov cl, A1_8 + %1 byte [A0], cl + %else + xchg A1, A0 + %1 byte [A1], cl + %endif + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u8 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %ifdef ASM_CALL64_GCC + mov cl, A1_8 + %1 word [A0], cl + %else + xchg A1, A0 + %1 word [A1], cl + %endif + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %ifdef ASM_CALL64_GCC + mov cl, A1_8 + %1 dword [A0], cl + %else + xchg A1, A0 + %1 dword [A1], cl + %endif + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + %ifdef ASM_CALL64_GCC + mov cl, A1_8 + %1 qword [A0], cl + %else + xchg A1, A0 + %1 qword [A1], cl + %endif + IEM_SAVE_FLAGS A2, %2, %3 + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; RT_ARCH_AMD64 + +%endmacro + +IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0 +IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0 +IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0 +IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0 +IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF) +IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF) +IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF) + + +;; +; Macro for implementing a double precision shift operation. +; +; This will generate code for the 16, 32 and 64 bit accesses, except on +; 32-bit system where the 64-bit accesses requires hand coding. +; +; The functions takes the destination operand (r/m) in A0, the source (reg) in +; A1, the shift count in A2 and a pointer to the eflags variable/register in A3. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. +; +; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments. 
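+;
+; For example, a hypothetical C call of the 'shld' worker generated below
+; (sketch only, not from the original sources):
+;
+;     uint32_t uDst = 0x80000000, fEFlags = 0;
+;     iemAImpl_shld_u32(&uDst, 0xf000000f /*uSrc*/, 4 /*cShift*/, &fEFlags);
+;     /* uDst is now 0x0000000f: the four top bits of uSrc shifted in. */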
+; +%macro IEMIMPL_SHIFT_DBL_OP 3 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + %ifdef ASM_CALL64_GCC + xchg A3, A2 + %1 [A0], A1_16, cl + xchg A3, A2 + %else + xchg A0, A2 + %1 [A2], A1_16, cl + %endif + IEM_SAVE_FLAGS A3, %2, %3 + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + %ifdef ASM_CALL64_GCC + xchg A3, A2 + %1 [A0], A1_32, cl + xchg A3, A2 + %else + xchg A0, A2 + %1 [A2], A1_32, cl + %endif + IEM_SAVE_FLAGS A3, %2, %3 + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + %ifdef ASM_CALL64_GCC + xchg A3, A2 + %1 [A0], A1, cl + xchg A3, A2 + %else + xchg A0, A2 + %1 [A2], A1, cl + %endif + IEM_SAVE_FLAGS A3, %2, %3 + EPILOGUE_4_ARGS_EX 12 +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; RT_ARCH_AMD64 + +%endmacro + +IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF) +IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF) + + +;; +; Macro for implementing a multiplication operations. +; +; This will generate code for the 8, 16, 32 and 64 bit accesses, except on +; 32-bit system where the 64-bit accesses requires hand coding. +; +; The 8-bit function only operates on AX, so it takes no DX pointer. The other +; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a +; pointer to eflags in A3. +; +; The functions all return 0 so the caller can be used for div/idiv as well as +; for the mul/imul implementation. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. +; +; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments. +; +%macro IEMIMPL_MUL_OP 3 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12 + PROLOGUE_3_ARGS + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + mov al, [A0] + %1 A1_8 + mov [A0], ax + IEM_SAVE_FLAGS A2, %2, %3 + xor eax, eax + EPILOGUE_3_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u8 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + mov ax, [A0] + %ifdef ASM_CALL64_GCC + %1 A2_16 + mov [A0], ax + mov [A1], dx + %else + mov T1, A1 + %1 A2_16 + mov [A0], ax + mov [T1], dx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + mov eax, [A0] + %ifdef ASM_CALL64_GCC + %1 A2_32 + mov [A0], eax + mov [A1], edx + %else + mov T1, A1 + %1 A2_32 + mov [A0], eax + mov [T1], edx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp. 
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20 + PROLOGUE_4_ARGS + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + mov rax, [A0] + %ifdef ASM_CALL64_GCC + %1 A2 + mov [A0], rax + mov [A1], rdx + %else + mov T1, A1 + %1 A2 + mov [A0], rax + mov [T1], rdx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + EPILOGUE_4_ARGS_EX 12 +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; !RT_ARCH_AMD64 + +%endmacro + +IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) +IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) + + +BEGINCODE +;; +; Worker function for negating a 32-bit number in T1:T0 +; @uses None (T0,T1) +iemAImpl_negate_T0_T1_u32: + push 0 + push 0 + xchg T0_32, [xSP] + xchg T1_32, [xSP + xCB] + sub T0_32, [xSP] + sbb T1_32, [xSP + xCB] + add xSP, xCB*2 + ret + +%ifdef RT_ARCH_AMD64 +;; +; Worker function for negating a 64-bit number in T1:T0 +; @uses None (T0,T1) +iemAImpl_negate_T0_T1_u64: + push 0 + push 0 + xchg T0, [xSP] + xchg T1, [xSP + xCB] + sub T0, [xSP] + sbb T1, [xSP + xCB] + add xSP, xCB*2 + ret +%endif + + +;; +; Macro for implementing a division operations. +; +; This will generate code for the 8, 16, 32 and 64 bit accesses, except on +; 32-bit system where the 64-bit accesses requires hand coding. +; +; The 8-bit function only operates on AX, so it takes no DX pointer. The other +; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a +; pointer to eflags in A3. +; +; The functions all return 0 on success and -1 if a divide error should be +; raised by the caller. +; +; @param 1 The instruction mnemonic. +; @param 2 The modified flags. +; @param 3 The undefined flags. +; @param 4 1 if signed, 0 if unsigned. +; +; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments. +; +%macro IEMIMPL_DIV_OP 4 +BEGINCODE +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12 + PROLOGUE_3_ARGS + + ; div by chainsaw check. + test A1_8, A1_8 + jz .div_zero + + ; Overflow check - unsigned division is simple to verify, haven't + ; found a simple way to check signed division yet unfortunately. + %if %4 == 0 + cmp [A0 + 1], A1_8 + jae .div_overflow + %else + mov T0_16, [A0] ; T0 = dividend + mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit) + test A1_8, A1_8 + js .divisor_negative + test T0_16, T0_16 + jns .both_positive + neg T0_16 +.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1). + push T0 ; Start off like unsigned below. + shr T0_16, 7 + cmp T0_8, A1_8 + pop T0 + jb .div_no_overflow + ja .div_overflow + and T0_8, 0x7f ; Special case for covering (divisor - 1). + cmp T0_8, A1_8 + jae .div_overflow + jmp .div_no_overflow + +.divisor_negative: + neg A1_8 + test T0_16, T0_16 + jns .one_of_each + neg T0_16 +.both_positive: ; Same as unsigned shifted by sign indicator bit. + shr T0_16, 7 + cmp T0_8, A1_8 + jae .div_overflow +.div_no_overflow: + mov A1, T1 ; restore divisor + %endif + + IEM_MAYBE_LOAD_FLAGS A2, %2, %3 + mov ax, [A0] + %1 A1_8 + mov [A0], ax + IEM_SAVE_FLAGS A2, %2, %3 + xor eax, eax + +.return: + EPILOGUE_3_ARGS + +.div_zero: +.div_overflow: + mov eax, -1 + jmp .return +ENDPROC iemAImpl_ %+ %1 %+ _u8 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16 + PROLOGUE_4_ARGS + + ; div by chainsaw check. + test A2_16, A2_16 + jz .div_zero + + ; Overflow check - unsigned division is simple to verify, haven't + ; found a simple way to check signed division yet unfortunately. 
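+    ; In C terms, what the signed variant (%4 != 0) below establishes is
+    ; that the quotient fits the destination; for this 16-bit worker,
+    ; roughly (illustrative sketch with made-up variable names):
+    ;
+    ;     int32_t qq = i32Dividend / (int16_t)i16Divisor;
+    ;     if (qq > INT16_MAX || qq < INT16_MIN)
+    ;         goto div_overflow;            /* would raise #DE */
+    ;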
+ %if %4 == 0 + cmp [A1], A2_16 + jae .div_overflow + %else + mov T0_16, [A1] + shl T0_32, 16 + mov T0_16, [A0] ; T0 = dividend + mov T1, A2 ; T1 = divisor + test T1_16, T1_16 + js .divisor_negative + test T0_32, T0_32 + jns .both_positive + neg T0_32 +.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1). + push T0 ; Start off like unsigned below. + shr T0_32, 15 + cmp T0_16, T1_16 + pop T0 + jb .div_no_overflow + ja .div_overflow + and T0_16, 0x7fff ; Special case for covering (divisor - 1). + cmp T0_16, T1_16 + jae .div_overflow + jmp .div_no_overflow + +.divisor_negative: + neg T1_16 + test T0_32, T0_32 + jns .one_of_each + neg T0_32 +.both_positive: ; Same as unsigned shifted by sign indicator bit. + shr T0_32, 15 + cmp T0_16, T1_16 + jae .div_overflow +.div_no_overflow: + %endif + + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + %ifdef ASM_CALL64_GCC + mov T1, A2 + mov ax, [A0] + mov dx, [A1] + %1 T1_16 + mov [A0], ax + mov [A1], dx + %else + mov T1, A1 + mov ax, [A0] + mov dx, [T1] + %1 A2_16 + mov [A0], ax + mov [T1], dx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + +.return: + EPILOGUE_4_ARGS + +.div_zero: +.div_overflow: + mov eax, -1 + jmp .return +ENDPROC iemAImpl_ %+ %1 %+ _u16 + +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16 + PROLOGUE_4_ARGS + + ; div by chainsaw check. + test A2_32, A2_32 + jz .div_zero + + ; Overflow check - unsigned division is simple to verify, haven't + ; found a simple way to check signed division yet unfortunately. + %if %4 == 0 + cmp [A1], A2_32 + jae .div_overflow + %else + push A2 ; save A2 so we modify it (we out of regs on x86). + mov T0_32, [A0] ; T0 = dividend low + mov T1_32, [A1] ; T1 = dividend high + test A2_32, A2_32 + js .divisor_negative + test T1_32, T1_32 + jns .both_positive + call iemAImpl_negate_T0_T1_u32 +.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1). + push T0 ; Start off like unsigned below. + shl T1_32, 1 + shr T0_32, 31 + or T1_32, T0_32 + cmp T1_32, A2_32 + pop T0 + jb .div_no_overflow + ja .div_overflow + and T0_32, 0x7fffffff ; Special case for covering (divisor - 1). + cmp T0_32, A2_32 + jae .div_overflow + jmp .div_no_overflow + +.divisor_negative: + neg A2_32 + test T1_32, T1_32 + jns .one_of_each + call iemAImpl_negate_T0_T1_u32 +.both_positive: ; Same as unsigned shifted by sign indicator bit. + shl T1_32, 1 + shr T0_32, 31 + or T1_32, T0_32 + cmp T1_32, A2_32 + jae .div_overflow +.div_no_overflow: + pop A2 + %endif + + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + mov eax, [A0] + %ifdef ASM_CALL64_GCC + mov T1, A2 + mov eax, [A0] + mov edx, [A1] + %1 T1_32 + mov [A0], eax + mov [A1], edx + %else + mov T1, A1 + mov eax, [A0] + mov edx, [T1] + %1 A2_32 + mov [A0], eax + mov [T1], edx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + +.return: + EPILOGUE_4_ARGS + +.div_overflow: + %if %4 != 0 + pop A2 + %endif +.div_zero: + mov eax, -1 + jmp .return +ENDPROC iemAImpl_ %+ %1 %+ _u32 + + %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp. +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20 + PROLOGUE_4_ARGS + + test A2, A2 + jz .div_zero + %if %4 == 0 + cmp [A1], A2 + jae .div_overflow + %else + push A2 ; save A2 so we modify it (we out of regs on x86). + mov T0, [A0] ; T0 = dividend low + mov T1, [A1] ; T1 = dividend high + test A2, A2 + js .divisor_negative + test T1, T1 + jns .both_positive + call iemAImpl_negate_T0_T1_u64 +.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1). + push T0 ; Start off like unsigned below. 
+ shl T1, 1 + shr T0, 63 + or T1, T0 + cmp T1, A2 + pop T0 + jb .div_no_overflow + ja .div_overflow + mov T1, 0x7fffffffffffffff + and T0, T1 ; Special case for covering (divisor - 1). + cmp T0, A2 + jae .div_overflow + jmp .div_no_overflow + +.divisor_negative: + neg A2 + test T1, T1 + jns .one_of_each + call iemAImpl_negate_T0_T1_u64 +.both_positive: ; Same as unsigned shifted by sign indicator bit. + shl T1, 1 + shr T0, 63 + or T1, T0 + cmp T1, A2 + jae .div_overflow +.div_no_overflow: + pop A2 + %endif + + IEM_MAYBE_LOAD_FLAGS A3, %2, %3 + mov rax, [A0] + %ifdef ASM_CALL64_GCC + mov T1, A2 + mov rax, [A0] + mov rdx, [A1] + %1 T1 + mov [A0], rax + mov [A1], rdx + %else + mov T1, A1 + mov rax, [A0] + mov rdx, [T1] + %1 A2 + mov [A0], rax + mov [T1], rdx + %endif + IEM_SAVE_FLAGS A3, %2, %3 + xor eax, eax + +.return: + EPILOGUE_4_ARGS_EX 12 + +.div_overflow: + %if %4 != 0 + pop A2 + %endif +.div_zero: + mov eax, -1 + jmp .return +ENDPROC iemAImpl_ %+ %1 %+ _u64 + %endif ; !RT_ARCH_AMD64 + +%endmacro + +IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 +IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1 + + +; +; BSWAP. No flag changes. +; +; Each function takes one argument, pointer to the value to bswap +; (input/output). They all return void. +; +BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4 + PROLOGUE_1_ARGS + mov T0_32, [A0] ; just in case any of the upper bits are used. + db 66h + bswap T0_32 + mov [A0], T0_32 + EPILOGUE_1_ARGS +ENDPROC iemAImpl_bswap_u16 + +BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4 + PROLOGUE_1_ARGS + mov T0_32, [A0] + bswap T0_32 + mov [A0], T0_32 + EPILOGUE_1_ARGS +ENDPROC iemAImpl_bswap_u32 + +BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4 +%ifdef RT_ARCH_AMD64 + PROLOGUE_1_ARGS + mov T0, [A0] + bswap T0 + mov [A0], T0 + EPILOGUE_1_ARGS +%else + PROLOGUE_1_ARGS + mov T0, [A0] + mov T1, [A0 + 4] + bswap T0 + bswap T1 + mov [A0 + 4], T0 + mov [A0], T1 + EPILOGUE_1_ARGS +%endif +ENDPROC iemAImpl_bswap_u64 + + +;; +; Initialize the FPU for the actual instruction being emulated, this means +; loading parts of the guest's control word and status word. +; +; @uses 24 bytes of stack. +; @param 1 Expression giving the address of the FXSTATE of the guest. +; +%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1 + fnstenv [xSP] + + ; FCW - for exception, precision and rounding control. + movzx T0, word [%1 + X86FXSTATE.FCW] + and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK + mov [xSP + X86FSTENV32P.FCW], T0_16 + + ; FSW - for undefined C0, C1, C2, and C3. + movzx T1, word [%1 + X86FXSTATE.FSW] + and T1, X86_FSW_C_MASK + movzx T0, word [xSP + X86FSTENV32P.FSW] + and T0, X86_FSW_TOP_MASK + or T0, T1 + mov [xSP + X86FSTENV32P.FSW], T0_16 + + fldenv [xSP] +%endmacro + + +;; +; Need to move this as well somewhere better? +; +struc IEMFPURESULT + .r80Result resw 5 + .FSW resw 1 +endstruc + + +;; +; Need to move this as well somewhere better? +; +struc IEMFPURESULTTWO + .r80Result1 resw 5 + .FSW resw 1 + .r80Result2 resw 5 +endstruc + + +; +;---------------------- 16-bit signed integer operations ---------------------- +; + + +;; +; Converts a 16-bit floating point value to a 80-bit one (fpu register). +; +; @param A0 FPU context (fxsave). +; @param A1 Pointer to a IEMFPURESULT for the output. +; @param A2 Pointer to the 16-bit floating point value to convert. 
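+;
+; A hypothetical C-side use, showing the IEMFPURESULT protocol defined
+; above (illustrative sketch only; pFXState and the locals are made up):
+;
+;     IEMFPURESULT Res;
+;     int16_t      i16Val = -42;
+;     iemAImpl_fild_i16_to_r80(pFXState /* guest X86FXSTATE */, &Res, &i16Val);
+;     /* Res.r80Result now holds -42.0 in 80-bit format, Res.FSW the FSW. */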
+;
+BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
+    PROLOGUE_3_ARGS
+    sub  xSP, 20h
+
+    fninit
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    fild word [A2]
+
+    fnstsw word [A1 + IEMFPURESULT.FSW]
+    fnclex
+    fstp tword [A1 + IEMFPURESULT.r80Result]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_fild_i16_to_r80
+
+
+;;
+; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to return the output FSW.
+; @param A2 Where to store the 16-bit signed integer value.
+; @param A3 Pointer to the 80-bit value.
+;
+BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A3]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    fistp word [A2]
+
+    fnstsw word [A1]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_fist_r80_to_i16
+
+
+;;
+; Store a 80-bit floating point value (register) as a 16-bit signed integer
+; (memory) with truncation.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to return the output FSW.
+; @param A2 Where to store the 16-bit signed integer value.
+; @param A3 Pointer to the 80-bit value.
+;
+BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A3]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    fisttp word [A2]
+
+    fnstsw word [A1]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_fistt_r80_to_i16
+
+
+;;
+; FPU instruction working on one 80-bit and one 16-bit signed integer value.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a IEMFPURESULT for the output.
+; @param A2 Pointer to the 80-bit value.
+; @param A3 Pointer to the 16-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_I16 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A2]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    %1   word [A3]
+
+    fnstsw word [A1 + IEMFPURESULT.FSW]
+    fnclex
+    fstp tword [A1 + IEMFPURESULT.r80Result]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
+%endmacro
+
+IEMIMPL_FPU_R80_BY_I16 fiadd
+IEMIMPL_FPU_R80_BY_I16 fimul
+IEMIMPL_FPU_R80_BY_I16 fisub
+IEMIMPL_FPU_R80_BY_I16 fisubr
+IEMIMPL_FPU_R80_BY_I16 fidiv
+IEMIMPL_FPU_R80_BY_I16 fidivr
+
+
+;;
+; FPU instruction working on one 80-bit and one 16-bit signed integer value,
+; only returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to store the output FSW.
+; @param A2 Pointer to the 80-bit value.
+; @param A3 Pointer to the 16-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A2]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    %1   word [A3]
+
+    fnstsw word [A1]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
+%endmacro
+
+IEMIMPL_FPU_R80_BY_I16_FSW ficom
+
+
+
+;
+;---------------------- 32-bit signed integer operations ----------------------
+;
+
+
+;;
+; Converts a 32-bit signed integer to a 80-bit one (fpu register).
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a IEMFPURESULT for the output.
+; @param A2 Pointer to the 32-bit signed integer value to convert.
+; +BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12 + PROLOGUE_3_ARGS + sub xSP, 20h + + fninit + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fild dword [A2] + + fnstsw word [A1 + IEMFPURESULT.FSW] + fnclex + fstp tword [A1 + IEMFPURESULT.r80Result] + + fninit + add xSP, 20h + EPILOGUE_3_ARGS +ENDPROC iemAImpl_fild_i32_to_r80 + + +;; +; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory). +; +; @param A0 FPU context (fxsave). +; @param A1 Where to return the output FSW. +; @param A2 Where to store the 32-bit signed integer value. +; @param A3 Pointer to the 80-bit value. +; +BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A3] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fistp dword [A2] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_fist_r80_to_i32 + + +;; +; Store a 80-bit floating point value (register) as a 32-bit signed integer +; (memory) with truncation. +; +; @param A0 FPU context (fxsave). +; @param A1 Where to return the output FSW. +; @param A2 Where to store the 32-bit signed integer value. +; @param A3 Pointer to the 80-bit value. +; +BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A3] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fisttp dword [A2] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_fistt_r80_to_i32 + + +;; +; FPU instruction working on one 80-bit and one 32-bit signed integer value. +; +; @param 1 The instruction +; +; @param A0 FPU context (fxsave). +; @param A1 Pointer to a IEMFPURESULT for the output. +; @param A2 Pointer to the 80-bit value. +; @param A3 Pointer to the 32-bit value. +; +%macro IEMIMPL_FPU_R80_BY_I32 1 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A2] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + %1 dword [A3] + + fnstsw word [A1 + IEMFPURESULT.FSW] + fnclex + fstp tword [A1 + IEMFPURESULT.r80Result] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32 +%endmacro + +IEMIMPL_FPU_R80_BY_I32 fiadd +IEMIMPL_FPU_R80_BY_I32 fimul +IEMIMPL_FPU_R80_BY_I32 fisub +IEMIMPL_FPU_R80_BY_I32 fisubr +IEMIMPL_FPU_R80_BY_I32 fidiv +IEMIMPL_FPU_R80_BY_I32 fidivr + + +;; +; FPU instruction working on one 80-bit and one 32-bit signed integer value, +; only returning FSW. +; +; @param 1 The instruction +; +; @param A0 FPU context (fxsave). +; @param A1 Where to store the output FSW. +; @param A2 Pointer to the 80-bit value. +; @param A3 Pointer to the 64-bit value. +; +%macro IEMIMPL_FPU_R80_BY_I32_FSW 1 +BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A2] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + %1 dword [A3] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32 +%endmacro + +IEMIMPL_FPU_R80_BY_I32_FSW ficom + + + +; +;---------------------- 64-bit signed integer operations ---------------------- +; + + +;; +; Converts a 64-bit floating point value to a 80-bit one (fpu register). +; +; @param A0 FPU context (fxsave). +; @param A1 Pointer to a IEMFPURESULT for the output. +; @param A2 Pointer to the 64-bit floating point value to convert. 
+; +BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12 + PROLOGUE_3_ARGS + sub xSP, 20h + + fninit + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fild qword [A2] + + fnstsw word [A1 + IEMFPURESULT.FSW] + fnclex + fstp tword [A1 + IEMFPURESULT.r80Result] + + fninit + add xSP, 20h + EPILOGUE_3_ARGS +ENDPROC iemAImpl_fild_i64_to_r80 + + +;; +; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory). +; +; @param A0 FPU context (fxsave). +; @param A1 Where to return the output FSW. +; @param A2 Where to store the 64-bit signed integer value. +; @param A3 Pointer to the 80-bit value. +; +BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A3] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fistp qword [A2] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_fist_r80_to_i64 + + +;; +; Store a 80-bit floating point value (register) as a 64-bit signed integer +; (memory) with truncation. +; +; @param A0 FPU context (fxsave). +; @param A1 Where to return the output FSW. +; @param A2 Where to store the 64-bit signed integer value. +; @param A3 Pointer to the 80-bit value. +; +BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A3] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fisttp qword [A2] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_fistt_r80_to_i64 + + + +; +;---------------------- 32-bit floating point operations ---------------------- +; + +;; +; Converts a 32-bit floating point value to a 80-bit one (fpu register). +; +; @param A0 FPU context (fxsave). +; @param A1 Pointer to a IEMFPURESULT for the output. +; @param A2 Pointer to the 32-bit floating point value to convert. +; +BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12 + PROLOGUE_3_ARGS + sub xSP, 20h + + fninit + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fld dword [A2] + + fnstsw word [A1 + IEMFPURESULT.FSW] + fnclex + fstp tword [A1 + IEMFPURESULT.r80Result] + + fninit + add xSP, 20h + EPILOGUE_3_ARGS +ENDPROC iemAImpl_fld_r32_to_r80 + + +;; +; Store a 80-bit floating point value (register) as a 32-bit one (memory). +; +; @param A0 FPU context (fxsave). +; @param A1 Where to return the output FSW. +; @param A2 Where to store the 32-bit value. +; @param A3 Pointer to the 80-bit value. +; +BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16 + PROLOGUE_4_ARGS + sub xSP, 20h + + fninit + fld tword [A3] + FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 + fst dword [A2] + + fnstsw word [A1] + + fninit + add xSP, 20h + EPILOGUE_4_ARGS +ENDPROC iemAImpl_fst_r80_to_r32 + + +;; +; FPU instruction working on one 80-bit and one 32-bit floating point value. +; +; @param 1 The instruction +; +; @param A0 FPU context (fxsave). +; @param A1 Pointer to a IEMFPURESULT for the output. +; @param A2 Pointer to the 80-bit value. +; @param A3 Pointer to the 32-bit value. 
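+;
+; For instance, a hypothetical C call of the 'fadd' instantiation below
+; (sketch only; RTFLOAT80U/RTFLOAT32U stand for the 80-/32-bit values):
+;
+;     IEMFPURESULT Res;
+;     RTFLOAT80U   r80Val1;   /* the ST0 operand           */
+;     RTFLOAT32U   r32Val2;   /* the 32-bit memory operand */
+;     iemAImpl_fadd_r80_by_r32(pFXState, &Res, &r80Val1, &r32Val2);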
+;
+%macro IEMIMPL_FPU_R80_BY_R32 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A2]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    %1   dword [A3]
+
+    fnstsw word [A1 + IEMFPURESULT.FSW]
+    fnclex
+    fstp tword [A1 + IEMFPURESULT.r80Result]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R32 fadd
+IEMIMPL_FPU_R80_BY_R32 fmul
+IEMIMPL_FPU_R80_BY_R32 fsub
+IEMIMPL_FPU_R80_BY_R32 fsubr
+IEMIMPL_FPU_R80_BY_R32 fdiv
+IEMIMPL_FPU_R80_BY_R32 fdivr
+
+
+;;
+; FPU instruction working on one 80-bit and one 32-bit floating point value,
+; only returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to store the output FSW.
+; @param A2 Pointer to the 80-bit value.
+; @param A3 Pointer to the 32-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A2]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    %1   dword [A3]
+
+    fnstsw word [A1]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R32_FSW fcom
+
+
+
+;
+;---------------------- 64-bit floating point operations ----------------------
+;
+
+;;
+; Converts a 64-bit floating point value to a 80-bit one (fpu register).
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a IEMFPURESULT for the output.
+; @param A2 Pointer to the 64-bit floating point value to convert.
+;
+BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
+    PROLOGUE_3_ARGS
+    sub  xSP, 20h
+
+    fninit
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    fld  qword [A2]
+
+    fnstsw word [A1 + IEMFPURESULT.FSW]
+    fnclex
+    fstp tword [A1 + IEMFPURESULT.r80Result]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_3_ARGS
+ENDPROC iemAImpl_fld_r64_to_r80
+
+
+;;
+; Store a 80-bit floating point value (register) as a 64-bit one (memory).
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to return the output FSW.
+; @param A2 Where to store the 64-bit value.
+; @param A3 Pointer to the 80-bit value.
+;
+BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A3]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    fst  qword [A2]
+
+    fnstsw word [A1]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_fst_r80_to_r64
+
+
+;;
+; FPU instruction working on one 80-bit and one 64-bit floating point value.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a IEMFPURESULT for the output.
+; @param A2 Pointer to the 80-bit value.
+; @param A3 Pointer to the 64-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_R64 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
+    PROLOGUE_4_ARGS
+    sub  xSP, 20h
+
+    fninit
+    fld  tword [A2]
+    FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+    %1   qword [A3]
+
+    fnstsw word [A1 + IEMFPURESULT.FSW]
+    fnclex
+    fstp tword [A1 + IEMFPURESULT.r80Result]
+
+    fninit
+    add  xSP, 20h
+    EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R64 fadd
+IEMIMPL_FPU_R80_BY_R64 fmul
+IEMIMPL_FPU_R80_BY_R64 fsub
+IEMIMPL_FPU_R80_BY_R64 fsubr
+IEMIMPL_FPU_R80_BY_R64 fdiv
+IEMIMPL_FPU_R80_BY_R64 fdivr
+
+;;
+; FPU instruction working on one 80-bit and one 64-bit floating point value,
+; only returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to store the output FSW.
+; @param A2 Pointer to the 80-bit value.
+;;
+; FPU instruction working on one 80-bit and one 64-bit floating point value,
+; only returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to store the output FSW.
+; @param A2 Pointer to the 80-bit value.
+; @param A3 Pointer to the 64-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1 qword [A3]
+
+ fnstsw word [A1]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R64_FSW fcom
+
+
+
+;
+;---------------------- 80-bit floating point operations ----------------------
+;
+
+;;
+; Loads an 80-bit floating point register value from memory.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULT for the output.
+; @param A2 Pointer to the 80-bit floating point value to load.
+;
+BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
+ PROLOGUE_3_ARGS
+ sub xSP, 20h
+
+ fninit
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ fld tword [A2]
+
+ fnstsw word [A1 + IEMFPURESULT.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULT.r80Result]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_fld_r80_from_r80
+
+
+;;
+; Store an 80-bit floating point register to memory.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Where to return the output FSW.
+; @param A2 Where to store the 80-bit value.
+; @param A3 Pointer to the 80-bit register value.
+;
+BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A3]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ fstp tword [A2]
+
+ fnstsw word [A1]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_fst_r80_to_r80
+
+
+;;
+; FPU instruction working on two 80-bit floating point values.
+;
+; @param 1 The instruction
+; @param 2 The instruction operands, e.g. {st0, st1}, or {} for instructions
+; with implicit operands.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULT for the output.
+; @param A2 Pointer to the first 80-bit value (ST0).
+; @param A3 Pointer to the second 80-bit value (STn).
+;
+%macro IEMIMPL_FPU_R80_BY_R80 2
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A3]
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1 %2
+
+ fnstsw word [A1 + IEMFPURESULT.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULT.r80Result]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
+IEMIMPL_FPU_R80_BY_R80 fprem, {}
+IEMIMPL_FPU_R80_BY_R80 fprem1, {}
+IEMIMPL_FPU_R80_BY_R80 fscale, {}
+
+
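+;
+; Note on operand order in IEMIMPL_FPU_R80_BY_R80: A3 is loaded first and A2
+; second, so when '%1 %2' executes, ST0 = *A2 and ST1 = *A3.  For example
+; IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1} computes *A2 - *A3, matching the
+; guest's ST0/STn pairing.
+;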
+;;
+; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
+; storing the result in ST1 and popping the stack.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULT for the output.
+; @param A2 Pointer to the first 80-bit value (ST1).
+; @param A3 Pointer to the second 80-bit value (ST0).
+;
+%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A2]
+ fld tword [A3]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1
+
+ fnstsw word [A1 + IEMFPURESULT.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULT.r80Result]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
+IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
+IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
+
+
+;;
+; FPU instruction working on two 80-bit floating point values, only
+; returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a uint16_t for the resulting FSW.
+; @param A2 Pointer to the first 80-bit value.
+; @param A3 Pointer to the second 80-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A3]
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1 st0, st1
+
+ fnstsw word [A1]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R80_FSW fcom
+IEMIMPL_FPU_R80_BY_R80_FSW fucom
+
+
+;;
+; FPU instruction working on two 80-bit floating point values,
+; returning FSW and EFLAGS (eax).
+;
+; @param 1 The instruction
+;
+; @returns EFLAGS in EAX.
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a uint16_t for the resulting FSW.
+; @param A2 Pointer to the first 80-bit value.
+; @param A3 Pointer to the second 80-bit value.
+;
+%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
+ PROLOGUE_4_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A3]
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1 st1
+
+ fnstsw word [A1]
+ pushf
+ pop xAX
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_4_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
+%endmacro
+
+IEMIMPL_FPU_R80_BY_R80_EFL fcomi
+IEMIMPL_FPU_R80_BY_R80_EFL fucomi
+
+
+;;
+; FPU instruction working on one 80-bit floating point value.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULT for the output.
+; @param A2 Pointer to the 80-bit value.
+;
+%macro IEMIMPL_FPU_R80 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
+ PROLOGUE_3_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1
+
+ fnstsw word [A1 + IEMFPURESULT.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULT.r80Result]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80
+%endmacro
+
+IEMIMPL_FPU_R80 fchs
+IEMIMPL_FPU_R80 fabs
+IEMIMPL_FPU_R80 f2xm1
+IEMIMPL_FPU_R80 fsqrt
+IEMIMPL_FPU_R80 frndint
+IEMIMPL_FPU_R80 fsin
+IEMIMPL_FPU_R80 fcos
+
+
+;;
+; FPU instruction working on one 80-bit floating point value, only
+; returning FSW.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to a uint16_t for the resulting FSW.
+; @param A2 Pointer to the 80-bit value.
+;
+%macro IEMIMPL_FPU_R80_FSW 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
+ PROLOGUE_3_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1
+
+ fnstsw word [A1]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80
+%endmacro
+
+IEMIMPL_FPU_R80_FSW ftst
+IEMIMPL_FPU_R80_FSW fxam
+
+
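+;
+; The FSW-only and EFLAGS-returning helpers above correspond roughly to the
+; following C shapes (a sketch; the parameter names are assumptions):
+;
+;       void     iemAImpl_ftst_r80(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw,
+;                                  PCRTFLOAT80U pr80Val);
+;       uint32_t iemAImpl_fcomi_r80_by_r80(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw,
+;                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2);
+;
+; The fcomi/fucomi variants return the host EFLAGS, captured via pushf/pop xAX,
+; so the caller can merge ZF/PF/CF back into the guest flags.
+;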
+;;
+; FPU instruction loading an 80-bit floating point constant.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULT for the output.
+;
+%macro IEMIMPL_FPU_R80_CONST 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
+ PROLOGUE_2_ARGS
+ sub xSP, 20h
+
+ fninit
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1
+
+ fnstsw word [A1 + IEMFPURESULT.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULT.r80Result]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_2_ARGS
+ENDPROC iemAImpl_ %+ %1
+%endmacro
+
+IEMIMPL_FPU_R80_CONST fld1
+IEMIMPL_FPU_R80_CONST fldl2t
+IEMIMPL_FPU_R80_CONST fldl2e
+IEMIMPL_FPU_R80_CONST fldpi
+IEMIMPL_FPU_R80_CONST fldlg2
+IEMIMPL_FPU_R80_CONST fldln2
+IEMIMPL_FPU_R80_CONST fldz
+
+
+;;
+; FPU instruction working on one 80-bit floating point value, outputting two.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to an IEMFPURESULTTWO for the output.
+; @param A2 Pointer to the 80-bit value.
+;
+%macro IEMIMPL_FPU_R80_R80 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
+ PROLOGUE_3_ARGS
+ sub xSP, 20h
+
+ fninit
+ fld tword [A2]
+ FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
+ %1
+
+ fnstsw word [A1 + IEMFPURESULTTWO.FSW]
+ fnclex
+ fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
+ fnclex
+ fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
+
+ fninit
+ add xSP, 20h
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
+%endmacro
+
+IEMIMPL_FPU_R80_R80 fptan
+IEMIMPL_FPU_R80_R80 fxtract
+IEMIMPL_FPU_R80_R80 fsincos
+
+
+
+
+;---------------------- SSE and MMX Operations ----------------------
+
+;; @todo what do we need to do for MMX?
+%macro IEMIMPL_MMX_PROLOGUE 0
+%endmacro
+%macro IEMIMPL_MMX_EPILOGUE 0
+%endmacro
+
+;; @todo what do we need to do for SSE?
+%macro IEMIMPL_SSE_PROLOGUE 0
+%endmacro
+%macro IEMIMPL_SSE_EPILOGUE 0
+%endmacro
+
+
+;;
+; Media instruction working on two full sized registers.
+;
+; @param 1 The instruction
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to the first media register size operand (input/output).
+; @param A2 Pointer to the second media register size operand (input).
+;
+%macro IEMIMPL_MEDIA_F2 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_MMX_PROLOGUE
+
+ movq mm0, [A1]
+ movq mm1, [A2]
+ %1 mm0, mm1
+ movq [A1], mm0
+
+ IEMIMPL_MMX_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u64
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_SSE_PROLOGUE
+
+ movdqu xmm0, [A1]
+ movdqu xmm1, [A2]
+ %1 xmm0, xmm1
+ movdqu [A1], xmm0
+
+ IEMIMPL_SSE_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u128
+%endmacro
+
+IEMIMPL_MEDIA_F2 pxor
+IEMIMPL_MEDIA_F2 pcmpeqb
+IEMIMPL_MEDIA_F2 pcmpeqw
+IEMIMPL_MEDIA_F2 pcmpeqd
+
+
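+;
+; As a concrete instance of IEMIMPL_MEDIA_F2: the pxor line above expands into
+; both iemAImpl_pxor_u64 (MMX, via mm0/mm1) and iemAImpl_pxor_u128 (SSE, via
+; xmm0/xmm1), with assumed C shapes along these lines:
+;
+;       void iemAImpl_pxor_u64(PCX86FXSTATE pFpuState, uint64_t *pu64Dst,
+;                              uint64_t const *pu64Src);   /* *pu64Dst ^= *pu64Src */
+;       void iemAImpl_pxor_u128(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst,
+;                               PCRTUINT128U pu128Src);    /* *pu128Dst ^= *pu128Src */
+;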
+;;
+; Media instruction working on one full sized and one half sized register (lower half).
+;
+; @param 1 The instruction
+; @param 2 1 if MMX is included, 0 if not.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to the first full sized media register operand (input/output).
+; @param A2 Pointer to the second half sized media register operand (input).
+;
+%macro IEMIMPL_MEDIA_F1L1 2
+ %if %2 != 0
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_MMX_PROLOGUE
+
+ movq mm0, [A1]
+ movd mm1, [A2]
+ %1 mm0, mm1
+ movq [A1], mm0
+
+ IEMIMPL_MMX_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u64
+ %endif
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_SSE_PROLOGUE
+
+ movdqu xmm0, [A1]
+ movq xmm1, [A2]
+ %1 xmm0, xmm1
+ movdqu [A1], xmm0
+
+ IEMIMPL_SSE_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u128
+%endmacro
+
+IEMIMPL_MEDIA_F1L1 punpcklbw, 1
+IEMIMPL_MEDIA_F1L1 punpcklwd, 1
+IEMIMPL_MEDIA_F1L1 punpckldq, 1
+IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
+
+
+;;
+; Media instruction working on one full sized and one half sized register (high half).
+;
+; @param 1 The instruction
+; @param 2 1 if MMX is included, 0 if not.
+;
+; @param A0 FPU context (fxsave).
+; @param A1 Pointer to the first full sized media register operand (input/output).
+; @param A2 Pointer to the second full sized media register operand, where we
+; will only use the upper half (input).
+;
+%macro IEMIMPL_MEDIA_F1H1 2
+ %if %2 != 0
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_MMX_PROLOGUE
+
+ movq mm0, [A1]
+ movq mm1, [A2]
+ %1 mm0, mm1
+ movq [A1], mm0
+
+ IEMIMPL_MMX_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u64
+ %endif
+
+BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_SSE_PROLOGUE
+
+ movdqu xmm0, [A1]
+ movdqu xmm1, [A2]
+ %1 xmm0, xmm1
+ movdqu [A1], xmm0
+
+ IEMIMPL_SSE_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_ %+ %1 %+ _u128
+%endmacro
+
+IEMIMPL_MEDIA_F1H1 punpckhbw, 1
+IEMIMPL_MEDIA_F1H1 punpckhwd, 1
+IEMIMPL_MEDIA_F1H1 punpckhdq, 1
+IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
+
+
+;
+; Shufflers with evil 8-bit immediates.
+;
+
+BEGINPROC_FASTCALL iemAImpl_pshufw, 16
+ PROLOGUE_4_ARGS
+ IEMIMPL_MMX_PROLOGUE
+
+ movq mm0, [A1]
+ movq mm1, [A2]
+ lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5
+ lea T1, [.imm0 xWrtRIP]
+ lea T1, [T1 + T0]
+ call T1
+ movq [A1], mm0
+
+ IEMIMPL_MMX_EPILOGUE
+ EPILOGUE_4_ARGS
+%assign bImm 0
+%rep 256
+.imm %+ bImm:
+ pshufw mm0, mm1, bImm
+ ret
+ %assign bImm bImm + 1
+%endrep
+.immEnd: ; 256*5 == 0x500
+dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
+dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
+ENDPROC iemAImpl_pshufw
+
+
+%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
+BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
+ PROLOGUE_4_ARGS
+ IEMIMPL_SSE_PROLOGUE
+
+ movdqu xmm0, [A1]
+ movdqu xmm1, [A2]
+ lea T1, [.imm0 xWrtRIP]
+ lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
+ lea T1, [T1 + T0*2]
+ call T1
+ movdqu [A1], xmm0
+
+ IEMIMPL_SSE_EPILOGUE
+ EPILOGUE_4_ARGS
+ %assign bImm 0
+ %rep 256
+.imm %+ bImm:
+ %1 xmm0, xmm1, bImm
+ ret
+ %assign bImm bImm + 1
+ %endrep
+.immEnd: ; 256*6 == 0x600
+dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
+dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
+ENDPROC iemAImpl_ %+ %1
+%endmacro
+
+IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
+IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
+IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
+
+
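+;
+; Worked example of the immediate dispatch used above: each of the 256 stub
+; entries has a fixed size (pshufw+ret == 5 bytes, pshufhw/pshuflw/pshufd+ret
+; == 6 bytes), so the stub for immediate N lives at .imm0 + N*5 respectively
+; .imm0 + N*6.  For pshufd with A3 = 1Bh:
+;
+;       lea T0, [A3 + A3*2]     ; T0 = 1Bh * 3 = 51h
+;       lea T1, [.imm0 xWrtRIP] ; T1 = table base
+;       lea T1, [T1 + T0*2]     ; T1 = .imm0 + 51h*2 = .imm0 + 1Bh*6
+;       call T1                 ; executes 'pshufd xmm0, xmm1, 1Bh' + 'ret'
+;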
+;
+; Move byte mask.
+;
+
+BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_MMX_PROLOGUE
+
+ mov T0, [A1]
+ movq mm1, [A2]
+ pmovmskb T0, mm1
+ mov [A1], T0
+%ifdef RT_ARCH_X86
+ mov dword [A1 + 4], 0
+%endif
+ IEMIMPL_MMX_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_pmovmskb_u64
+
+BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
+ PROLOGUE_3_ARGS
+ IEMIMPL_SSE_PROLOGUE
+
+ mov T0, [A1]
+ movdqu xmm1, [A2]
+ pmovmskb T0, xmm1
+ mov [A1], T0
+%ifdef RT_ARCH_X86
+ mov dword [A1 + 4], 0
+%endif
+ IEMIMPL_SSE_EPILOGUE
+ EPILOGUE_3_ARGS
+ENDPROC iemAImpl_pmovmskb_u128
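+
+;
+; Rough C shape of the two pmovmskb helpers above (an assumption for
+; illustration, not the authoritative prototypes):
+;
+;       void iemAImpl_pmovmskb_u64(PCX86FXSTATE pFpuState, uint64_t *pu64Dst,
+;                                  uint64_t const *pu64Src);
+;       void iemAImpl_pmovmskb_u128(PCX86FXSTATE pFpuState, uint64_t *pu64Dst,
+;                                   PCRTUINT128U pu128Src);
+;
+; On 32-bit hosts T0 is only 32 bits wide, hence the explicit
+; 'mov dword [A1 + 4], 0' to clear the upper half of the 64-bit destination;
+; on AMD64 writing the mask through the 64-bit T0 already covers all 8 bytes.
+;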