Diffstat (limited to 'src/VBox/Runtime/common/math/bignum-amd64-x86.asm')
-rw-r--r--   src/VBox/Runtime/common/math/bignum-amd64-x86.asm   881
1 files changed, 881 insertions, 0 deletions
diff --git a/src/VBox/Runtime/common/math/bignum-amd64-x86.asm b/src/VBox/Runtime/common/math/bignum-amd64-x86.asm
new file mode 100644
index 00000000..1b3754fd
--- /dev/null
+++ b/src/VBox/Runtime/common/math/bignum-amd64-x86.asm
@@ -0,0 +1,881 @@

; $Id: bignum-amd64-x86.asm $
;; @file
; IPRT - Big Integer Numbers, AMD64 and X86 Assembly Workers
;

;
; Copyright (C) 2006-2019 Oracle Corporation
;
; This file is part of VirtualBox Open Source Edition (OSE), as
; available from http://www.virtualbox.org. This file is free software;
; you can redistribute it and/or modify it under the terms of the GNU
; General Public License (GPL) as published by the Free Software
; Foundation, in version 2 as it comes in the "COPYING" file of the
; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
; VirtualBox OSE distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;


;*********************************************************************************************************************************
;*  Header Files                                                                                                                 *
;*********************************************************************************************************************************
%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
%include "internal/bignum.mac"


;*********************************************************************************************************************************
;*  Defined Constants And Macros                                                                                                 *
;*********************************************************************************************************************************
%ifdef RT_ARCH_AMD64
 %macro sahf 0
  %error "SAHF not supported on ancient AMD64"
 %endmacro
 %macro lahf 0
  %error "LAHF not supported on ancient AMD64"
 %endmacro
%endif


BEGINCODE

;;
; Subtracts a number (pauSubtrahend) from a larger number (pauMinuend) and
; stores the result in pauResult.
;
; All three numbers are zero padded such that a borrow can be carried one (or
; two for 64-bit) elements beyond the end of the largest number.
;
; @returns nothing.
; @param    pauResult       x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauMinuend      x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    pauSubtrahend   x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    cUsed           x86:[ebp + 20]  gcc:rcx  msc:r9
;
BEGINPROC rtBigNumMagnitudeSubAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauResult     rdi
  %define pauMinuend    rsi
  %define pauSubtrahend rdx
  %define cUsed         ecx
 %else
  %define pauResult     rcx
  %define pauMinuend    rdx
  %define pauSubtrahend r8
  %define cUsed         r9d
 %endif
        xor     r11d, r11d              ; index register.

 %if RTBIGNUM_ELEMENT_SIZE == 4
        add     cUsed, 1                ; cUsed = RT_ALIGN(cUsed, 2) / 2
        shr     cUsed, 1
 %endif
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        mov     r10d, cUsed
        shr     r10d, 3
        clc
.big_loop:
        mov     rax, [pauMinuend + r11]
        sbb     rax, [pauSubtrahend + r11]
        mov     [pauResult + r11], rax
        mov     rax, [pauMinuend + r11 + 8]
        sbb     rax, [pauSubtrahend + r11 + 8]
        mov     [pauResult + r11 + 8], rax
        mov     rax, [pauMinuend + r11 + 16]
        sbb     rax, [pauSubtrahend + r11 + 16]
        mov     [pauResult + r11 + 16], rax
        mov     rax, [pauMinuend + r11 + 24]
        sbb     rax, [pauSubtrahend + r11 + 24]
        mov     [pauResult + r11 + 24], rax
        mov     rax, [pauMinuend + r11 + 32]
        sbb     rax, [pauSubtrahend + r11 + 32]
        mov     [pauResult + r11 + 32], rax
        mov     rax, [pauMinuend + r11 + 40]
        sbb     rax, [pauSubtrahend + r11 + 40]
        mov     [pauResult + r11 + 40], rax
        mov     rax, [pauMinuend + r11 + 48]
        sbb     rax, [pauSubtrahend + r11 + 48]
        mov     [pauResult + r11 + 48], rax
        mov     rax, [pauMinuend + r11 + 56]
        sbb     rax, [pauSubtrahend + r11 + 56]
        mov     [pauResult + r11 + 56], rax
        lea     r11, [r11 + 64]
        dec     r10d                    ; Does not change CF.
        jnz     .big_loop

 %if 0 ; Ancient AMD64 CPUs do not have lahf/sahf, thus the mess in the %else.
        lahf                            ; Save CF.
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.
 %else
        jnc     .no_carry
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        stc
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.
.no_carry:
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
 %endif
.small_job:
        clc
.small_loop:
        mov     rax, [pauMinuend + r11]
        sbb     rax, [pauSubtrahend + r11]
        mov     [pauResult + r11], rax
        lea     r11, [r11 + 8]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done
        int3
 %endif
.done:

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx

        mov     edi, [ebp + 08h]        ; pauResult
 %define pauResult      edi
        mov     ecx, [ebp + 0ch]        ; pauMinuend
 %define pauMinuend     ecx
        mov     edx, [ebp + 10h]        ; pauSubtrahend
 %define pauSubtrahend  edx
        mov     esi, [ebp + 14h]        ; cUsed
 %define cUsed          esi

        xor     ebx, ebx                ; index register.

        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        shr     cUsed, 3
        clc
.big_loop:
        mov     eax, [pauMinuend + ebx]
        sbb     eax, [pauSubtrahend + ebx]
        mov     [pauResult + ebx], eax
        mov     eax, [pauMinuend + ebx + 4]
        sbb     eax, [pauSubtrahend + ebx + 4]
        mov     [pauResult + ebx + 4], eax
        mov     eax, [pauMinuend + ebx + 8]
        sbb     eax, [pauSubtrahend + ebx + 8]
        mov     [pauResult + ebx + 8], eax
        mov     eax, [pauMinuend + ebx + 12]
        sbb     eax, [pauSubtrahend + ebx + 12]
        mov     [pauResult + ebx + 12], eax
        mov     eax, [pauMinuend + ebx + 16]
        sbb     eax, [pauSubtrahend + ebx + 16]
        mov     [pauResult + ebx + 16], eax
        mov     eax, [pauMinuend + ebx + 20]
        sbb     eax, [pauSubtrahend + ebx + 20]
        mov     [pauResult + ebx + 20], eax
        mov     eax, [pauMinuend + ebx + 24]
        sbb     eax, [pauSubtrahend + ebx + 24]
        mov     [pauResult + ebx + 24], eax
        mov     eax, [pauMinuend + ebx + 28]
        sbb     eax, [pauSubtrahend + ebx + 28]
        mov     [pauResult + ebx + 28], eax
        lea     ebx, [ebx + 32]
        dec     cUsed                   ; Does not change CF.
        jnz     .big_loop

        lahf                            ; Save CF.
        mov     cUsed, [ebp + 14h]      ; Up to seven odd rounds.
        and     cUsed, 7
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.

.small_job:
        clc
.small_loop:
        mov     eax, [pauMinuend + ebx]
        sbb     eax, [pauSubtrahend + ebx]
        mov     [pauResult + ebx], eax
        lea     ebx, [ebx + 4]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done
        int3
 %endif
.done:

        pop     ebx
        pop     esi
        pop     edi
%else
 %error "Unsupported arch"
%endif

        leave
        ret
%undef pauResult
%undef pauMinuend
%undef pauSubtrahend
%undef cUsed
ENDPROC rtBigNumMagnitudeSubAssemblyWorker
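For reference, the SBB chains above compute the following, shown here as a minimal C sketch assuming 64-bit elements (RTBIGNUM_ELEMENT_SIZE == 8). The bignumSubRef name and the plain uint64_t arrays are illustrative stand-ins, not IPRT's actual declarations; fBorrow plays the role the carry flag plays in the loops:

#include <stddef.h>
#include <stdint.h>

/* Reference version of the subtraction worker: pauResult = pauMinuend - pauSubtrahend. */
static void bignumSubRef(uint64_t *pauResult, const uint64_t *pauMinuend,
                         const uint64_t *pauSubtrahend, size_t cUsed)
{
    unsigned fBorrow = 0;                                   /* Plays the role of CF for SBB. */
    for (size_t i = 0; i < cUsed; i++)
    {
        uint64_t uDiff = pauMinuend[i] - pauSubtrahend[i];
        unsigned fB1   = pauMinuend[i] < pauSubtrahend[i];  /* Borrow out of the element subtraction. */
        pauResult[i]   = uDiff - fBorrow;
        fBorrow        = fB1 | (uDiff < fBorrow);           /* Borrow out of applying the old borrow. */
    }
    /* As in the asm, the caller guarantees zero padding, so a final borrow
       cannot occur for well-formed inputs (the RT_STRICT int3 asserts this). */
}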
;;
; Subtracts a number (pauSubtrahend) from a larger number (pauResultMinuend),
; storing the result in place of the minuend.
;
; Both numbers are zero padded such that a borrow can be carried one (or
; two for 64-bit) elements beyond the end of the largest number.
;
; @returns nothing.
; @param    pauResultMinuend    x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauSubtrahend       x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cUsed               x86:[ebp + 16]  gcc:rdx  msc:r8
;
BEGINPROC rtBigNumMagnitudeSubThisAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauResultMinuend  rdi
  %define pauSubtrahend     rsi
  %define cUsed             edx
 %else
  %define pauResultMinuend  rcx
  %define pauSubtrahend     rdx
  %define cUsed             r8d
 %endif
        xor     r11d, r11d              ; index register.

 %if RTBIGNUM_ELEMENT_SIZE == 4
        add     cUsed, 1                ; cUsed = RT_ALIGN(cUsed, 2) / 2
        shr     cUsed, 1
 %endif
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        mov     r10d, cUsed
        shr     r10d, 3
        clc
.big_loop:
        mov     rax, [pauSubtrahend + r11]
        sbb     [pauResultMinuend + r11], rax
        mov     rax, [pauSubtrahend + r11 + 8]
        sbb     [pauResultMinuend + r11 + 8], rax
        mov     rax, [pauSubtrahend + r11 + 16]
        sbb     [pauResultMinuend + r11 + 16], rax
        mov     rax, [pauSubtrahend + r11 + 24]
        sbb     [pauResultMinuend + r11 + 24], rax
        mov     rax, [pauSubtrahend + r11 + 32]
        sbb     [pauResultMinuend + r11 + 32], rax
        mov     rax, [pauSubtrahend + r11 + 40]
        sbb     [pauResultMinuend + r11 + 40], rax
        mov     rax, [pauSubtrahend + r11 + 48]
        sbb     [pauResultMinuend + r11 + 48], rax
        mov     rax, [pauSubtrahend + r11 + 56]
        sbb     [pauResultMinuend + r11 + 56], rax
        lea     r11, [r11 + 64]
        dec     r10d                    ; Does not change CF.
        jnz     .big_loop

 %if 0 ; Ancient AMD64 CPUs do not have lahf/sahf, thus the mess in the %else.
        lahf                            ; Save CF.
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.
 %else
        jnc     .no_carry
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        stc
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.
.no_carry:
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
 %endif
.small_job:
        clc
.small_loop:
        mov     rax, [pauSubtrahend + r11]
        sbb     [pauResultMinuend + r11], rax
        lea     r11, [r11 + 8]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done
        int3
 %endif
.done:

%elifdef RT_ARCH_X86
        push    edi
        push    ebx

        mov     edi, [ebp + 08h]        ; pauResultMinuend
 %define pauResultMinuend   edi
        mov     edx, [ebp + 0ch]        ; pauSubtrahend
 %define pauSubtrahend      edx
        mov     ecx, [ebp + 10h]        ; cUsed
 %define cUsed              ecx

        xor     ebx, ebx                ; index register.

        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        shr     cUsed, 3
        clc
.big_loop:
        mov     eax, [pauSubtrahend + ebx]
        sbb     [pauResultMinuend + ebx], eax
        mov     eax, [pauSubtrahend + ebx + 4]
        sbb     [pauResultMinuend + ebx + 4], eax
        mov     eax, [pauSubtrahend + ebx + 8]
        sbb     [pauResultMinuend + ebx + 8], eax
        mov     eax, [pauSubtrahend + ebx + 12]
        sbb     [pauResultMinuend + ebx + 12], eax
        mov     eax, [pauSubtrahend + ebx + 16]
        sbb     [pauResultMinuend + ebx + 16], eax
        mov     eax, [pauSubtrahend + ebx + 20]
        sbb     [pauResultMinuend + ebx + 20], eax
        mov     eax, [pauSubtrahend + ebx + 24]
        sbb     [pauResultMinuend + ebx + 24], eax
        mov     eax, [pauSubtrahend + ebx + 28]
        sbb     [pauResultMinuend + ebx + 28], eax
        lea     ebx, [ebx + 32]
        dec     cUsed                   ; Does not change CF.
        jnz     .big_loop

        lahf                            ; Save CF.
        mov     cUsed, [ebp + 10h]      ; Up to seven odd rounds.
        and     cUsed, 7
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Jump past the clc at .small_job so CF is preserved.

.small_job:
        clc
.small_loop:
        mov     eax, [pauSubtrahend + ebx]
        sbb     [pauResultMinuend + ebx], eax
        lea     ebx, [ebx + 4]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done
        int3
 %endif
.done:

        pop     ebx
        pop     edi
%else
 %error "Unsupported arch"
%endif

        leave
        ret
ENDPROC rtBigNumMagnitudeSubThisAssemblyWorker
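The in-place variant differs from the previous worker only in that the minuend array doubles as the result, which is why SBB here targets memory. The corresponding C sketch, under the same illustrative assumptions as before:

#include <stddef.h>
#include <stdint.h>

/* Reference version of the in-place worker: pauResultMinuend -= pauSubtrahend. */
static void bignumSubThisRef(uint64_t *pauResultMinuend, const uint64_t *pauSubtrahend, size_t cUsed)
{
    unsigned fBorrow = 0;
    for (size_t i = 0; i < cUsed; i++)
    {
        uint64_t uDiff = pauResultMinuend[i] - pauSubtrahend[i];
        unsigned fB1   = pauResultMinuend[i] < pauSubtrahend[i];
        pauResultMinuend[i] = uDiff - fBorrow;
        fBorrow             = fB1 | (uDiff < fBorrow);
    }
}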
;;
; Shifts an element array one bit to the left, returning the final carry value.
;
; On 64-bit hosts the array is always zero padded to a multiple of 8 bytes, so
; we can use 64-bit operand sizes even if the element type is 32-bit.
;
; @returns The final carry value.
; @param    pauElements     x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    cUsed           x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    uCarry          x86:[ebp + 16]  gcc:rdx  msc:r8
;
BEGINPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauElements   rdi
  %define cUsed         esi
  %define uCarry        edx
 %else
  %define pauElements   rcx
  %define cUsed         edx
  %define uCarry        r8d
 %endif
%elifdef RT_ARCH_X86
 %define pauElements    ecx
        mov     pauElements, [ebp + 08h]
 %define cUsed          edx
        mov     cUsed, [ebp + 0ch]
 %define uCarry         eax
        mov     uCarry, [ebp + 10h]
%else
 %error "Unsupported arch."
%endif
        ; Lots to do?
        cmp     cUsed, 8
        jae     .big_loop_init

        ; Check for an empty array.
        test    cUsed, cUsed
        jz      .no_elements
        jmp     .small_loop_init

        ; Big loop - 8 unrolled loop iterations.
.big_loop_init:
%ifdef RT_ARCH_AMD64
        mov     r11d, cUsed
%endif
        shr     cUsed, 3
        test    uCarry, uCarry          ; This clears the carry flag.
        jz      .big_loop
        stc
.big_loop:
%if RTBIGNUM_ELEMENT_SIZE == 8
        rcl     qword [pauElements], 1
        rcl     qword [pauElements + 8], 1
        rcl     qword [pauElements + 16], 1
        rcl     qword [pauElements + 24], 1
        rcl     qword [pauElements + 32], 1
        rcl     qword [pauElements + 40], 1
        rcl     qword [pauElements + 48], 1
        rcl     qword [pauElements + 56], 1
        lea     pauElements, [pauElements + 64]
%else
        rcl     dword [pauElements], 1
        rcl     dword [pauElements + 4], 1
        rcl     dword [pauElements + 8], 1
        rcl     dword [pauElements + 12], 1
        rcl     dword [pauElements + 16], 1
        rcl     dword [pauElements + 20], 1
        rcl     dword [pauElements + 24], 1
        rcl     dword [pauElements + 28], 1
        lea     pauElements, [pauElements + 32]
%endif
        dec     cUsed
        jnz     .big_loop

        ; More to do?
        pushf                           ; Save the carry flag (uCarry is no longer usable on x86).
%ifdef RT_ARCH_AMD64
        mov     cUsed, r11d
%else
        mov     cUsed, [ebp + 0ch]
%endif
        and     cUsed, 7
        jz      .restore_cf_and_return  ; Jump if we're good and done.
        popf                            ; Restore CF.
        jmp     .small_loop             ; Deal with the odd rounds.
.restore_cf_and_return:
        popf
        jmp     .carry_to_eax

        ; Small loop - one round at a time.
.small_loop_init:
        test    uCarry, uCarry          ; This clears the carry flag.
        jz      .small_loop
        stc
.small_loop:
%if RTBIGNUM_ELEMENT_SIZE == 8
        rcl     qword [pauElements], 1
        lea     pauElements, [pauElements + 8]
%else
        rcl     dword [pauElements], 1
        lea     pauElements, [pauElements + 4]
%endif
        dec     cUsed
        jnz     .small_loop

        ; Calculate the return value.
.carry_to_eax:
        mov     eax, 0
        jnc     .return
        inc     eax
.return:
        leave
        ret

.no_elements:
        mov     eax, uCarry
        jmp     .return
ENDPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
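Each RCL shifts one element left by a bit, pulling the previous element's top bit in through CF; the test/stc prologue seeds CF from uCarry. In C terms, under the same illustrative 64-bit-element assumptions, the whole worker does:

#include <stddef.h>
#include <stdint.h>

/* Reference version of the shift worker: shifts the array left by one bit,
   feeding uCarry in at the bottom and returning the bit shifted out the top. */
static uint64_t bignumShiftLeftOneRef(uint64_t *pauElements, size_t cUsed, uint64_t uCarry)
{
    for (size_t i = 0; i < cUsed; i++)
    {
        uint64_t uNewCarry = pauElements[i] >> 63;              /* The bit RCL would move into CF. */
        pauElements[i]     = (pauElements[i] << 1) | (uCarry & 1);
        uCarry             = uNewCarry;
    }
    return uCarry;      /* For an empty array this is the input carry, matching .no_elements. */
}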
;;
; Performs a 128-bit by 64-bit division on 64-bit hosts and
; a 64-bit by 32-bit division on 32-bit hosts.
;
; @returns nothing.
; @param    puQuotient      x86:[ebp +  8]  gcc:rdi  msc:rcx        Double element.
; @param    puRemainder     x86:[ebp + 12]  gcc:rsi  msc:rdx        Normal element.
; @param    uDividendHi     x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    uDividendLo     x86:[ebp + 20]  gcc:rcx  msc:r9
; @param    uDivisor        x86:[ebp + 24]  gcc:r8   msc:[rbp + 30h]
;
BEGINPROC rtBigNumElement2xDiv2xBy1x
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

 %define uDividendHi    rdx
 %define uDividendLo    rax
 %ifdef ASM_CALL64_GCC
  %define uDivisor      r8
  %define puQuotient    rdi
  %define puRemainder   rsi
        mov     rax, rcx
 %else
  %define puQuotient    rcx
  %define puRemainder   r11
  %define uDivisor      r10
        mov     r11, rdx
        mov     r10, [rbp + 30h]
        mov     rdx, r8
        mov     rax, r9
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    ebx

 %define uDividendHi    edx
        mov     uDividendHi, [ebp + 10h]
 %define uDividendLo    eax
        mov     uDividendLo, [ebp + 14h]
 %define uDivisor       ecx
        mov     uDivisor, [ebp + 18h]
 %define puQuotient     edi
        mov     puQuotient, [ebp + 08h]
 %define puRemainder    ebx
        mov     puRemainder, [ebp + 0ch]
%else
 %error "Unsupported arch."
%endif

%ifdef RT_STRICT
        ;
        ; The divisor shall not be zero.
        ;
        test    uDivisor, uDivisor
        jnz     .divisor_not_zero
        int3
.divisor_not_zero:
%endif

        ;
        ; Avoid division overflow. This will calculate the high part of the quotient.
        ;
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], 0
        cmp     uDividendHi, uDivisor
        jb      .do_divide
        push    xAX
        mov     xAX, xDX
        xor     edx, edx
        div     uDivisor
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], xAX
        pop     xAX

        ;
        ; Perform the division and store the result.
        ;
.do_divide:
        div     uDivisor
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient], xAX
        mov     RTBIGNUM_ELEMENT_PRE [puRemainder], xDX

%ifdef RT_ARCH_X86
        pop     ebx
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumElement2xDiv2xBy1x
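The preliminary division exists because a single DIV raises #DE when the quotient does not fit in one register: the high quotient element is computed first and its remainder becomes the new high dividend half. In C the same computation needs a double-width integer; here is a sketch of the 64-bit host case using unsigned __int128, which is a GCC/Clang extension, with illustrative names as before:

#include <stdint.h>

/* Reference version of the 128-by-64 division worker. puQuotient points at a
   "double element", i.e. room for two uint64_t values (low part first). */
static void bignumDiv2xBy1xRef(uint64_t puQuotient[2], uint64_t *puRemainder,
                               uint64_t uDividendHi, uint64_t uDividendLo, uint64_t uDivisor)
{
    unsigned __int128 uDividend = ((unsigned __int128)uDividendHi << 64) | uDividendLo;
    unsigned __int128 uQuotient = uDividend / uDivisor;
    puQuotient[0] = (uint64_t)uQuotient;
    puQuotient[1] = (uint64_t)(uQuotient >> 64);    /* Zero unless uDividendHi >= uDivisor. */
    *puRemainder  = (uint64_t)(uDividend % uDivisor);
}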
;;
; Performs the core of long multiplication.
;
; @returns nothing.
; @param    pauResult           x86:[ebp +  8]  gcc:rdi  msc:rcx        Initialized to zero.
; @param    pauMultiplier       x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cMultiplier         x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    pauMultiplicand     x86:[ebp + 20]  gcc:rcx  msc:r9
; @param    cMultiplicand       x86:[ebp + 24]  gcc:r8   msc:[rbp + 30h]
;
BEGINPROC rtBigNumMagnitudeMultiplyAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

 %ifdef ASM_CALL64_GCC
  %define pauResult         rdi
  %define pauMultiplier     rsi
  %define cMultiplier       r9
  %define pauMultiplicand   rcx
  %define cMultiplicand     r8
        mov     r9d, edx                ; cMultiplier
        mov     r8d, r8d                ; cMultiplicand - paranoia
  %define uMultiplier       r10
  %define iMultiplicand     r11
 %else
  %define pauResult         rcx
  %define pauMultiplier     r11
  %define cMultiplier       r8
  %define pauMultiplicand   r9
  %define cMultiplicand     r10
        mov     pauMultiplier, rdx
        mov     r10d, dword [rbp + 30h] ; cMultiplicand
        mov     r8d, r8d                ; cMultiplier - paranoia
  %define uMultiplier       r12
        push    r12
  %define iMultiplicand     r13
        push    r13
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx
        sub     esp, 10h
 %define pauResult          edi
        mov     pauResult, [ebp + 08h]
 %define pauMultiplier      dword [ebp + 0ch]
 %define cMultiplier        dword [ebp + 10h]
 %define pauMultiplicand    ecx
        mov     pauMultiplicand, [ebp + 14h]
 %define cMultiplicand      dword [ebp + 18h]
 %define uMultiplier        dword [ebp - 10h]
 %define iMultiplicand      ebx

%else
 %error "Unsupported arch."
%endif

        ;
        ; Check that the multiplicand isn't empty (avoids an extra jump in the inner loop).
        ;
        cmp     cMultiplicand, 0
        je      .done

        ;
        ; Loop thru each element in the multiplier.
        ;
        ; while (cMultiplier-- > 0)
.multiplier_loop:
        cmp     cMultiplier, 0
        jz      .done
        dec     cMultiplier

        ; uMultiplier = *pauMultiplier
%ifdef RT_ARCH_X86
        mov     edx, pauMultiplier
        mov     eax, [edx]
        mov     uMultiplier, eax
%else
        mov     uMultiplier, [pauMultiplier]
%endif
        ; for (iMultiplicand = 0; iMultiplicand < cMultiplicand; iMultiplicand++)
        xor     iMultiplicand, iMultiplicand
.multiplicand_loop:
        mov     xAX, [pauMultiplicand + iMultiplicand * RTBIGNUM_ELEMENT_SIZE]
        mul     uMultiplier
        add     [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE], xAX
        adc     [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE + RTBIGNUM_ELEMENT_SIZE], xDX
        jnc     .next_multiplicand
        lea     xDX, [iMultiplicand + 2]
.next_adc:
        adc     RTBIGNUM_ELEMENT_PRE [pauResult + xDX * RTBIGNUM_ELEMENT_SIZE], 0
        inc     xDX
        jc      .next_adc

.next_multiplicand:
        inc     iMultiplicand                   ; iMultiplicand++
        cmp     iMultiplicand, cMultiplicand    ; iMultiplicand < cMultiplicand
        jb      .multiplicand_loop

        ; Advance and loop on multiplier.
        add     pauMultiplier, RTBIGNUM_ELEMENT_SIZE
        add     pauResult, RTBIGNUM_ELEMENT_SIZE
        jmp     .multiplier_loop

.done:

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
 %else
        pop     r13
        pop     r12
 %endif
%elifdef RT_ARCH_X86
        add     esp, 10h
        pop     ebx
        pop     esi
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumMagnitudeMultiplyAssemblyWorker
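The worker is plain schoolbook multiplication: each multiplier element is multiplied against the whole multiplicand, and the double-width products are accumulated into the result, with the .next_adc loop rippling an overflow upward. A C sketch of the same computation, again assuming 64-bit elements and using the GCC/Clang unsigned __int128 extension; the ripple is folded into a running carry, which gives the same result:

#include <stddef.h>
#include <stdint.h>

/* Reference version of the multiplication core. pauResult must have room for
   cMultiplier + cMultiplicand elements and must start out zeroed. */
static void bignumMulRef(uint64_t *pauResult,
                         const uint64_t *pauMultiplier, size_t cMultiplier,
                         const uint64_t *pauMultiplicand, size_t cMultiplicand)
{
    for (size_t i = 0; i < cMultiplier; i++)
    {
        uint64_t uCarry = 0;
        for (size_t j = 0; j < cMultiplicand; j++)
        {
            /* 64x64 -> 128 multiply plus two 64-bit additions cannot overflow 128 bits. */
            unsigned __int128 uProd = (unsigned __int128)pauMultiplier[i] * pauMultiplicand[j]
                                    + pauResult[i + j] + uCarry;
            pauResult[i + j] = (uint64_t)uProd;
            uCarry           = (uint64_t)(uProd >> 64);
        }
        pauResult[i + cMultiplicand] += uCarry;     /* This slot is still zero here, so no further carry. */
    }
}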
;;
; Assembly implementation of the D4 step of Knuth's division algorithm.
;
; This subtracts Divisor * Qhat from the dividend at the current J index.
;
; @returns true if the result turned negative (unlikely), false if positive.
; @param    pauDividendJ    x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauDivisor      x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cDivisor        x86:[ebp + 16]  gcc:edx  msc:r8d
; @param    uQhat           x86:[ebp + 20]  gcc:rcx  msc:r9
;
BEGINPROC rtBigNumKnuthD4_MulSub
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

 %ifdef ASM_CALL64_GCC
  %define pauDividendJ  rdi
  %define pauDivisor    rsi
  %define cDivisor      r8
  %define uQhat         rcx
        mov     r8d, edx                ; cDivisor
  %define uMulCarry     r11
 %else
  %define pauDividendJ  rcx
  %define pauDivisor    r10
  %define cDivisor      r8
  %define uQhat         r9
        mov     r10, rdx                ; pauDivisor
        mov     r8d, r8d                ; cDivisor - paranoia
  %define uMulCarry     r11
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx
 %define pauDividendJ   edi
        mov     pauDividendJ, [ebp + 08h]
 %define pauDivisor     esi
        mov     pauDivisor, [ebp + 0ch]
 %define cDivisor       ecx
        mov     cDivisor, [ebp + 10h]
 %define uQhat          dword [ebp + 14h]
 %define uMulCarry      ebx
%else
 %error "Unsupported arch."
%endif

%ifdef RT_STRICT
        ;
        ; Some sanity checks.
        ;
        cmp     cDivisor, 0
        jne     .cDivisor_not_zero
        int3
.cDivisor_not_zero:
%endif

        ;
        ; Initialize the loop.
        ;
        xor     uMulCarry, uMulCarry

        ;
        ; do ... while (cDivisor-- > 0);
        ;
.the_loop:
        ; RTUInt128MulU64ByU64(&uSub, uQhat, pauDivisor[i]);
        mov     xAX, uQhat
        mul     RTBIGNUM_ELEMENT_PRE [pauDivisor]
        ; RTUInt128AssignAddU64(&uSub, uMulCarry);
        add     xAX, uMulCarry
        adc     xDX, 0
        mov     uMulCarry, xDX
        ; Subtract uSub.s.Lo+fCarry from pauDividendJ[i].
        sub     [pauDividendJ], xAX
        adc     uMulCarry, 0
%ifdef RT_STRICT
        jnc     .uMulCarry_did_not_overflow
        int3
.uMulCarry_did_not_overflow:
%endif

        ; Advance.
        add     pauDividendJ, RTBIGNUM_ELEMENT_SIZE
        add     pauDivisor, RTBIGNUM_ELEMENT_SIZE
        dec     cDivisor
        jnz     .the_loop

        ;
        ; Final dividend element (no corresponding divisor element).
        ;
        sub     [pauDividendJ], uMulCarry
        sbb     eax, eax
        and     eax, 1

.done:
%ifdef RT_ARCH_AMD64
%elifdef RT_ARCH_X86
        pop     ebx
        pop     esi
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumKnuthD4_MulSub
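What the loop computes, as a final C sketch under the same assumptions (illustrative names, uint64_t elements, unsigned __int128 as a GCC/Clang extension): each divisor element is multiplied by qhat, the running carry is folded in, and the low product half is subtracted from the dividend, with the borrow folded back into the carry exactly as the adc after the sub does above:

#include <stddef.h>
#include <stdint.h>

/* Reference version of the D4 step:
   pauDividendJ[0..cDivisor] -= uQhat * pauDivisor[0..cDivisor-1].
   Returns 1 if the subtraction wrapped negative (qhat was too large), else 0. */
static int bignumKnuthD4MulSubRef(uint64_t *pauDividendJ, const uint64_t *pauDivisor,
                                  size_t cDivisor, uint64_t uQhat)
{
    uint64_t uMulCarry = 0;                     /* High product half plus accumulated borrow. */
    for (size_t i = 0; i < cDivisor; i++)
    {
        unsigned __int128 uSub = (unsigned __int128)uQhat * pauDivisor[i] + uMulCarry;
        uint64_t uLo = (uint64_t)uSub;
        uMulCarry    = (uint64_t)(uSub >> 64) + (pauDividendJ[i] < uLo);
        pauDividendJ[i] -= uLo;
    }
    /* Final dividend element has no divisor counterpart; a borrow here means
       the overall result went negative and qhat must be corrected (step D6). */
    int fNegative = pauDividendJ[cDivisor] < uMulCarry;
    pauDividendJ[cDivisor] -= uMulCarry;
    return fNegative;
}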