Diffstat (limited to 'src/VBox/Runtime/common/math/bignum-amd64-x86.asm')
-rw-r--r--  src/VBox/Runtime/common/math/bignum-amd64-x86.asm  881
1 files changed, 881 insertions, 0 deletions
diff --git a/src/VBox/Runtime/common/math/bignum-amd64-x86.asm b/src/VBox/Runtime/common/math/bignum-amd64-x86.asm
new file mode 100644
index 00000000..1b3754fd
--- /dev/null
+++ b/src/VBox/Runtime/common/math/bignum-amd64-x86.asm
@@ -0,0 +1,881 @@
+; $Id: bignum-amd64-x86.asm $
+;; @file
+; IPRT - Big Integer Numbers, AMD64 and X86 Assembly Workers
+;
+
+;
+; Copyright (C) 2006-2019 Oracle Corporation
+;
+; This file is part of VirtualBox Open Source Edition (OSE), as
+; available from http://www.virtualbox.org. This file is free software;
+; you can redistribute it and/or modify it under the terms of the GNU
+; General Public License (GPL) as published by the Free Software
+; Foundation, in version 2 as it comes in the "COPYING" file of the
+; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+;
+; The contents of this file may alternatively be used under the terms
+; of the Common Development and Distribution License Version 1.0
+; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
+; VirtualBox OSE distribution, in which case the provisions of the
+; CDDL are applicable instead of those of the GPL.
+;
+; You may elect to license modified versions of this file under the
+; terms and conditions of either the GPL or the CDDL or both.
+;
+
+
+;*********************************************************************************************************************************
+;* Header Files *
+;*********************************************************************************************************************************
+%define RT_ASM_WITH_SEH64
+%include "iprt/asmdefs.mac"
+%include "internal/bignum.mac"
+
+
+;*********************************************************************************************************************************
+;* Defined Constants And Macros *
+;*********************************************************************************************************************************
+%ifdef RT_ARCH_AMD64
+ %macro sahf 0
+ %error "SAHF not supported on ancient AMD64"
+ %endmacro
+ %macro lahf 0
+ %error "LAHF not supported on ancient AMD64"
+ %endmacro
+%endif
+
+
+BEGINCODE
+
+;;
+; Subtracts a number (pauSubtrahend) from a larger number (pauMinuend) and
+; stores the result in pauResult.
+;
+; All three numbers are zero padded such that a borrow can be carried one (or
+; two for 64-bit) elements beyond the end of the largest number.
+;
+; @returns nothing.
+; @param pauResult x86:[ebp + 8] gcc:rdi msc:rcx
+; @param pauMinuend x86:[ebp + 12] gcc:rsi msc:rdx
+; @param pauSubtrahend x86:[ebp + 16] gcc:rdx msc:r8
+; @param cUsed x86:[ebp + 20] gcc:rcx msc:r9
+;
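+; For reference, a hedged C-level sketch of what this worker computes (not
+; part of the build; RTBIGNUMELEMENT stands in for the element type that
+; internal/bignum.h actually defines):
+;
+;     RTBIGNUMELEMENT fBorrow = 0;
+;     for (uint32_t i = 0; i < cUsed; i++)
+;     {
+;         RTBIGNUMELEMENT uMinuend    = pauMinuend[i];
+;         RTBIGNUMELEMENT uSubtrahend = pauSubtrahend[i];
+;         pauResult[i] = uMinuend - uSubtrahend - fBorrow;
+;         fBorrow      = uMinuend < uSubtrahend || (uMinuend == uSubtrahend && fBorrow);
+;     }
+;
+; The assembly below does the same with SBB, unrolling eight elements per
+; iteration of the big loop.
+;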
+BEGINPROC rtBigNumMagnitudeSubAssemblyWorker
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %ifdef ASM_CALL64_GCC
+ %define pauResult rdi
+ %define pauMinuend rsi
+ %define pauSubtrahend rdx
+ %define cUsed ecx
+ %else
+ %define pauResult rcx
+ %define pauMinuend rdx
+ %define pauSubtrahend r8
+ %define cUsed r9d
+ %endif
+ xor r11d, r11d ; index register.
+
+ %if RTBIGNUM_ELEMENT_SIZE == 4
+ add cUsed, 1 ; cUsed = RT_ALIGN(cUsed, 2) / 2
+ shr cUsed, 1
+ %endif
+ cmp cUsed, 8 ; Skip the big loop if small number.
+ jb .small_job
+
+ mov r10d, cUsed
+ shr r10d, 3
+ clc
+.big_loop:
+ mov rax, [pauMinuend + r11]
+ sbb rax, [pauSubtrahend + r11]
+ mov [pauResult + r11], rax
+ mov rax, [pauMinuend + r11 + 8]
+ sbb rax, [pauSubtrahend + r11 + 8]
+ mov [pauResult + r11 + 8], rax
+ mov rax, [pauMinuend + r11 + 16]
+ sbb rax, [pauSubtrahend + r11 + 16]
+ mov [pauResult + r11 + 16], rax
+ mov rax, [pauMinuend + r11 + 24]
+ sbb rax, [pauSubtrahend + r11 + 24]
+ mov [pauResult + r11 + 24], rax
+ mov rax, [pauMinuend + r11 + 32]
+ sbb rax, [pauSubtrahend + r11 + 32]
+ mov [pauResult + r11 + 32], rax
+ mov rax, [pauMinuend + r11 + 40]
+ sbb rax, [pauSubtrahend + r11 + 40]
+ mov [pauResult + r11 + 40], rax
+ mov rax, [pauMinuend + r11 + 48]
+ sbb rax, [pauSubtrahend + r11 + 48]
+ mov [pauResult + r11 + 48], rax
+ mov rax, [pauMinuend + r11 + 56]
+ sbb rax, [pauSubtrahend + r11 + 56]
+ mov [pauResult + r11 + 56], rax
+ lea r11, [r11 + 64]
+ dec r10d ; Does not change CF.
+ jnz .big_loop
+
+ %if 0 ; Ancient AMD64 CPUs don't have lahf/sahf in 64-bit mode, thus the mess in the %else.
+ lahf ; Save CF
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ sahf ; Restore CF.
+ jmp .small_loop ; Skip CF=1 (clc).
+ %else
+ jnc .no_carry
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ stc
+ jmp .small_loop ; Skip CF=1 (clc).
+.no_carry:
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ %endif
+.small_job:
+ clc
+.small_loop:
+ mov rax, [pauMinuend + r11]
+ sbb rax, [pauSubtrahend + r11]
+ mov [pauResult + r11], rax
+ lea r11, [r11 + 8]
+ dec cUsed ; does not change CF.
+ jnz .small_loop
+ %ifdef RT_STRICT
+ jnc .done
+ int3
+ %endif
+.done:
+
+%elifdef RT_ARCH_X86
+ push edi
+ push esi
+ push ebx
+
+ mov edi, [ebp + 08h] ; pauResult
+ %define pauResult edi
+ mov ecx, [ebp + 0ch] ; pauMinuend
+ %define pauMinuend ecx
+ mov edx, [ebp + 10h] ; pauSubtrahend
+ %define pauSubtrahend edx
+ mov esi, [ebp + 14h] ; cUsed
+ %define cUsed esi
+
+ xor ebx, ebx ; index register.
+
+ cmp cUsed, 8 ; Skip the big loop if small number.
+ jb .small_job
+
+ shr cUsed, 3
+ clc
+.big_loop:
+ mov eax, [pauMinuend + ebx]
+ sbb eax, [pauSubtrahend + ebx]
+ mov [pauResult + ebx], eax
+ mov eax, [pauMinuend + ebx + 4]
+ sbb eax, [pauSubtrahend + ebx + 4]
+ mov [pauResult + ebx + 4], eax
+ mov eax, [pauMinuend + ebx + 8]
+ sbb eax, [pauSubtrahend + ebx + 8]
+ mov [pauResult + ebx + 8], eax
+ mov eax, [pauMinuend + ebx + 12]
+ sbb eax, [pauSubtrahend + ebx + 12]
+ mov [pauResult + ebx + 12], eax
+ mov eax, [pauMinuend + ebx + 16]
+ sbb eax, [pauSubtrahend + ebx + 16]
+ mov [pauResult + ebx + 16], eax
+ mov eax, [pauMinuend + ebx + 20]
+ sbb eax, [pauSubtrahend + ebx + 20]
+ mov [pauResult + ebx + 20], eax
+ mov eax, [pauMinuend + ebx + 24]
+ sbb eax, [pauSubtrahend + ebx + 24]
+ mov [pauResult + ebx + 24], eax
+ mov eax, [pauMinuend + ebx + 28]
+ sbb eax, [pauSubtrahend + ebx + 28]
+ mov [pauResult + ebx + 28], eax
+ lea ebx, [ebx + 32]
+ dec cUsed ; Does not change CF.
+ jnz .big_loop
+
+ lahf ; Save CF
+ mov cUsed, [ebp + 14h] ; Up to seven final rounds.
+ and cUsed, 7
+ jz .done
+ sahf ; Restore CF.
+ jmp .small_loop ; Skip CF=1 (clc).
+
+.small_job:
+ clc
+.small_loop:
+ mov eax, [pauMinuend + ebx]
+ sbb eax, [pauSubtrahend + ebx]
+ mov [pauResult + ebx], eax
+ lea ebx, [ebx + 4]
+ dec cUsed ; Does not change CF
+ jnz .small_loop
+ %ifdef RT_STRICT
+ jnc .done
+ int3
+ %endif
+.done:
+
+ pop ebx
+ pop esi
+ pop edi
+%else
+ %error "Unsupported arch"
+%endif
+
+ leave
+ ret
+%undef pauResult
+%undef pauMinuend
+%undef pauSubtrahend
+%undef cUsed
+ENDPROC rtBigNumMagnitudeSubAssemblyWorker
+
+
+
+;;
+; Subtracts a number (pauSubtrahend) from a larger number (pauResultMinuend),
+; storing the result in place in pauResultMinuend.
+;
+; Both numbers are zero padded such that a borrow can be carried one (or
+; two for 64-bit) elements beyond the end of the largest number.
+;
+; @returns nothing.
+; @param pauResultMinuend x86:[ebp + 8] gcc:rdi msc:rcx
+; @param pauSubtrahend x86:[ebp + 12] gcc:rsi msc:rdx
+; @param cUsed x86:[ebp + 16] gcc:rdx msc:r8
+;
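+; A hedged C-level sketch of the in-place variant (reference only, not part
+; of the build; RTBIGNUMELEMENT is assumed to be the IPRT element type):
+;
+;     RTBIGNUMELEMENT fBorrow = 0;
+;     for (uint32_t i = 0; i < cUsed; i++)
+;     {
+;         RTBIGNUMELEMENT uMinuend    = pauResultMinuend[i];
+;         RTBIGNUMELEMENT uSubtrahend = pauSubtrahend[i];
+;         pauResultMinuend[i] = uMinuend - uSubtrahend - fBorrow;
+;         fBorrow             = uMinuend < uSubtrahend || (uMinuend == uSubtrahend && fBorrow);
+;     }
+;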
+BEGINPROC rtBigNumMagnitudeSubThisAssemblyWorker
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %ifdef ASM_CALL64_GCC
+ %define pauResultMinuend rdi
+ %define pauSubtrahend rsi
+ %define cUsed edx
+ %else
+ %define pauResultMinuend rcx
+ %define pauSubtrahend rdx
+ %define cUsed r8d
+ %endif
+ xor r11d, r11d ; index register.
+
+ %if RTBIGNUM_ELEMENT_SIZE == 4
+ add cUsed, 1 ; cUsed = RT_ALIGN(cUsed, 2) / 2
+ shr cUsed, 1
+ %endif
+ cmp cUsed, 8 ; Skip the big loop if small number.
+ jb .small_job
+
+ mov r10d, cUsed
+ shr r10d, 3
+ clc
+.big_loop:
+ mov rax, [pauSubtrahend + r11]
+ sbb [pauResultMinuend + r11], rax
+ mov rax, [pauSubtrahend + r11 + 8]
+ sbb [pauResultMinuend + r11 + 8], rax
+ mov rax, [pauSubtrahend + r11 + 16]
+ sbb [pauResultMinuend + r11 + 16], rax
+ mov rax, [pauSubtrahend + r11 + 24]
+ sbb [pauResultMinuend + r11 + 24], rax
+ mov rax, [pauSubtrahend + r11 + 32]
+ sbb [pauResultMinuend + r11 + 32], rax
+ mov rax, [pauSubtrahend + r11 + 40]
+ sbb [pauResultMinuend + r11 + 40], rax
+ mov rax, [pauSubtrahend + r11 + 48]
+ sbb [pauResultMinuend + r11 + 48], rax
+ mov rax, [pauSubtrahend + r11 + 56]
+ sbb [pauResultMinuend + r11 + 56], rax
+ lea r11, [r11 + 64]
+ dec r10d ; Does not change CF.
+ jnz .big_loop
+
+ %if 0 ; Ancient AMD64 CPUs don't have lahf/sahf in 64-bit mode, thus the mess in the %else.
+ lahf ; Save CF
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ sahf ; Restore CF.
+ jmp .small_loop ; Skip CF=1 (clc).
+ %else
+ jnc .no_carry
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ stc
+ jmp .small_loop ; Skip CF=1 (clc).
+.no_carry:
+ and cUsed, 7 ; Up to seven odd rounds.
+ jz .done
+ %endif
+.small_job:
+ clc
+.small_loop:
+ mov rax, [pauSubtrahend + r11]
+ sbb [pauResultMinuend + r11], rax
+ lea r11, [r11 + 8]
+ dec cUsed ; does not change CF.
+ jnz .small_loop
+ %ifdef RT_STRICT
+ jnc .done
+ int3
+ %endif
+.done:
+
+%elifdef RT_ARCH_X86
+ push edi
+ push ebx
+
+ mov edi, [ebp + 08h] ; pauResultMinuend
+ %define pauResultMinuend edi
+ mov edx, [ebp + 0ch] ; pauSubtrahend
+ %define pauSubtrahend edx
+ mov ecx, [ebp + 10h] ; cUsed
+ %define cUsed ecx
+
+ xor ebx, ebx ; index register.
+
+ cmp cUsed, 8 ; Skip the big loop if small number.
+ jb .small_job
+
+ shr cUsed, 3
+ clc
+.big_loop:
+ mov eax, [pauSubtrahend + ebx]
+ sbb [pauResultMinuend + ebx], eax
+ mov eax, [pauSubtrahend + ebx + 4]
+ sbb [pauResultMinuend + ebx + 4], eax
+ mov eax, [pauSubtrahend + ebx + 8]
+ sbb [pauResultMinuend + ebx + 8], eax
+ mov eax, [pauSubtrahend + ebx + 12]
+ sbb [pauResultMinuend + ebx + 12], eax
+ mov eax, [pauSubtrahend + ebx + 16]
+ sbb [pauResultMinuend + ebx + 16], eax
+ mov eax, [pauSubtrahend + ebx + 20]
+ sbb [pauResultMinuend + ebx + 20], eax
+ mov eax, [pauSubtrahend + ebx + 24]
+ sbb [pauResultMinuend + ebx + 24], eax
+ mov eax, [pauSubtrahend + ebx + 28]
+ sbb [pauResultMinuend + ebx + 28], eax
+ lea ebx, [ebx + 32]
+ dec cUsed ; Does not change CF.
+ jnz .big_loop
+
+ lahf ; Save CF
+ mov cUsed, [ebp + 10h] ; Up to seven odd rounds.
+ and cUsed, 7
+ jz .done
+ sahf ; Restore CF.
+ jmp .small_loop ; Skip CF=1 (clc).
+
+.small_job:
+ clc
+.small_loop:
+ mov eax, [pauSubtrahend + ebx]
+ sbb [pauResultMinuend + ebx], eax
+ lea ebx, [ebx + 4]
+ dec cUsed ; Does not change CF
+ jnz .small_loop
+ %ifdef RT_STRICT
+ jnc .done
+ int3
+ %endif
+.done:
+
+ pop ebx
+ pop edi
+%else
+ %error "Unsupported arch"
+%endif
+
+ leave
+ ret
+ENDPROC rtBigNumMagnitudeSubThisAssemblyWorker
+
+
+;;
+; Shifts an element array one bit to the left, returning the final carry value.
+;
+; On 64-bit hosts the array is always zero padded to a multiple of 8 bytes, so
+; we can use 64-bit operand sizes even if the element type is 32-bit.
+;
+; @returns The final carry value.
+; @param pauElements x86:[ebp + 8] gcc:rdi msc:rcx
+; @param cUsed x86:[ebp + 12] gcc:rsi msc:rdx
+; @param uCarry x86:[ebp + 16] gcc:rdx msc:r8
+;
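+; A hedged C-level sketch of the shift (reference only, not part of the
+; build; RTBIGNUM_ELEMENT_BITS is assumed to be the element width in bits):
+;
+;     RTBIGNUMELEMENT fCarry = uCarry;
+;     for (uint32_t i = 0; i < cUsed; i++)
+;     {
+;         RTBIGNUMELEMENT uOld = pauElements[i];
+;         pauElements[i] = (uOld << 1) | fCarry;
+;         fCarry         = uOld >> (RTBIGNUM_ELEMENT_BITS - 1);
+;     }
+;     return fCarry;
+;
+; The assembly below uses RCL to rotate the carry through each element.
+;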
+BEGINPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %ifdef ASM_CALL64_GCC
+ %define pauElements rdi
+ %define cUsed esi
+ %define uCarry edx
+ %else
+ %define pauElements rcx
+ %define cUsed edx
+ %define uCarry r8d
+ %endif
+%elifdef RT_ARCH_X86
+ %define pauElements ecx
+ mov pauElements, [ebp + 08h]
+ %define cUsed edx
+ mov cUsed, [ebp + 0ch]
+ %define uCarry eax
+ mov uCarry, [ebp + 10h]
+%else
+ %error "Unsupported arch."
+%endif
+ ; Lots to do?
+ cmp cUsed, 8
+ jae .big_loop_init
+
+ ; Check for empty array.
+ test cUsed, cUsed
+ jz .no_elements
+ jmp .small_loop_init
+
+ ; Big loop - 8 unrolled loop iterations.
+.big_loop_init:
+%ifdef RT_ARCH_AMD64
+ mov r11d, cUsed
+%endif
+ shr cUsed, 3
+ test uCarry, uCarry ; clear the carry flag
+ jz .big_loop
+ stc
+.big_loop:
+%if RTBIGNUM_ELEMENT_SIZE == 8
+ rcl qword [pauElements], 1
+ rcl qword [pauElements + 8], 1
+ rcl qword [pauElements + 16], 1
+ rcl qword [pauElements + 24], 1
+ rcl qword [pauElements + 32], 1
+ rcl qword [pauElements + 40], 1
+ rcl qword [pauElements + 48], 1
+ rcl qword [pauElements + 56], 1
+ lea pauElements, [pauElements + 64]
+%else
+ rcl dword [pauElements], 1
+ rcl dword [pauElements + 4], 1
+ rcl dword [pauElements + 8], 1
+ rcl dword [pauElements + 12], 1
+ rcl dword [pauElements + 16], 1
+ rcl dword [pauElements + 20], 1
+ rcl dword [pauElements + 24], 1
+ rcl dword [pauElements + 28], 1
+ lea pauElements, [pauElements + 32]
+%endif
+ dec cUsed
+ jnz .big_loop
+
+ ; More to do?
+ pushf ; save carry flag (uCarry no longer used on x86).
+%ifdef RT_ARCH_AMD64
+ mov cUsed, r11d
+%else
+ mov cUsed, [ebp + 0ch]
+%endif
+ and cUsed, 7
+ jz .restore_cf_and_return ; Jump if we're good and done.
+ popf ; Restore CF.
+ jmp .small_loop ; Deal with the odd rounds.
+.restore_cf_and_return:
+ popf
+ jmp .carry_to_eax
+
+ ; Small loop - One round at the time.
+.small_loop_init:
+ test uCarry, uCarry ; clear the carry flag
+ jz .small_loop
+ stc
+.small_loop:
+%if RTBIGNUM_ELEMENT_SIZE == 8
+ rcl qword [pauElements], 1
+ lea pauElements, [pauElements + 8]
+%else
+ rcl dword [pauElements], 1
+ lea pauElements, [pauElements + 4]
+%endif
+ dec cUsed
+ jnz .small_loop
+
+ ; Calculate return value.
+.carry_to_eax:
+ mov eax, 0
+ jnc .return
+ inc eax
+.return:
+ leave
+ ret
+
+.no_elements:
+ mov eax, uCarry
+ jmp .return
+ENDPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
+
+
+;;
+; Performs a 128-bit by 64-bit division on 64-bit hosts and
+; a 64-bit by 32-bit division on 32-bit hosts.
+;
+; @returns nothing.
+; @param puQuotient x86:[ebp + 8] gcc:rdi msc:rcx Double element.
+; @param puRemainder x86:[ebp + 12] gcc:rsi msc:rdx Normal element.
+; @param uDividendHi x86:[ebp + 16] gcc:rdx msc:r8
+; @param uDividendLo x86:[ebp + 20] gcc:rcx msc:r9
+; @param uDivisor x86:[ebp + 24] gcc:r8 msc:[rbp + 30h]
+;
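+; In C terms, the 32-bit variant computes the following (a hedged sketch,
+; reference only, not part of the build):
+;
+;     uint64_t uDividend = ((uint64_t)uDividendHi << 32) | uDividendLo;
+;     puQuotient[0] = (uint32_t)(uDividend / uDivisor);
+;     puQuotient[1] = (uint32_t)((uDividend / uDivisor) >> 32);
+;     *puRemainder  = (uint32_t)(uDividend % uDivisor);
+;
+; The 64-bit variant does the same with 128-bit intermediates.  DIV raises
+; #DE when the quotient does not fit in a single register, which is why the
+; code below splits the work into two divisions when uDividendHi >= uDivisor.
+;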
+BEGINPROC rtBigNumElement2xDiv2xBy1x
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %if RTBIGNUM_ELEMENT_SIZE == 4
+ %error "sorry not implemented yet."
+ sorry not implemented yet.
+ %endif
+
+ %define uDividendHi rdx
+ %define uDividendLo rax
+ %ifdef ASM_CALL64_GCC
+ %define uDivisor r8
+ %define puQuotient rdi
+ %define puRemainder rsi
+ mov rax, rcx
+ %else
+ %define puQuotient rcx
+ %define puRemainder r11
+ %define uDivisor r10
+ mov r11, rdx
+ mov r10, [rbp + 30h]
+ mov rdx, r8
+ mov rax, r9
+ %endif
+
+%elifdef RT_ARCH_X86
+ push edi
+ push ebx
+
+ %define uDividendHi edx
+ mov uDividendHi, [ebp + 10h]
+ %define uDividendLo eax
+ mov uDividendLo, [ebp + 14h]
+ %define uDivisor ecx
+ mov uDivisor, [ebp + 18h]
+ %define puQuotient edi
+ mov puQuotient, [ebp + 08h]
+ %define puRemainder ebx
+ mov puRemainder, [ebp + 0ch]
+%else
+ %error "Unsupported arch."
+%endif
+
+%ifdef RT_STRICT
+ ;
+ ; The divisor shall not be zero.
+ ;
+ test uDivisor, uDivisor
+ jnz .divisor_not_zero
+ int3
+.divisor_not_zero:
+%endif
+
+ ;
+ ; Avoid division overflow (DIV raises #DE when the quotient does not fit in a
+ ; single register): if the high dividend element is >= the divisor, divide it
+ ; first to get the high part of the quotient, keeping its remainder in xDX as
+ ; the new high dividend part.
+ ;
+ mov RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], 0
+ cmp uDividendHi, uDivisor
+ jb .do_divide
+ push xAX
+ mov xAX, xDX
+ xor edx, edx
+ div uDivisor
+ mov RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], xAX
+ pop xAX
+
+ ;
+ ; Perform the division and store the result.
+ ;
+.do_divide:
+ div uDivisor
+ mov RTBIGNUM_ELEMENT_PRE [puQuotient], xAX
+ mov RTBIGNUM_ELEMENT_PRE [puRemainder], xDX
+
+
+%ifdef RT_ARCH_X86
+ pop ebx
+ pop edi
+%endif
+ leave
+ ret
+ENDPROC rtBigNumElement2xDiv2xBy1x
+
+
+;;
+; Performs the core of long multiplication.
+;
+; @returns nothing.
+; @param pauResult x86:[ebp + 8] gcc:rdi msc:rcx Initialized to zero.
+; @param pauMultiplier x86:[ebp + 12] gcc:rsi msc:rdx
+; @param cMultiplier x86:[ebp + 16] gcc:rdx msc:r8
+; @param pauMultiplicand x86:[ebp + 20] gcc:rcx msc:r9
+; @param cMultiplicand x86:[ebp + 24] gcc:r8 msc:[rbp + 30h]
+;
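+; A hedged C-level sketch of the schoolbook multiplication performed here
+; (reference only, not part of the build; shown for 64-bit elements with a
+; compiler-provided unsigned __int128):
+;
+;     for (uint32_t i = 0; i < cMultiplier; i++)
+;         for (uint32_t j = 0; j < cMultiplicand; j++)
+;         {
+;             unsigned __int128 uAdd = (unsigned __int128)pauMultiplier[i] * pauMultiplicand[j];
+;             for (uint32_t k = i + j; uAdd != 0; k++)
+;             {   /* add the product into the (zero initialized) result, rippling the carry upwards */
+;                 uAdd += pauResult[k];
+;                 pauResult[k] = (uint64_t)uAdd;
+;                 uAdd >>= 64;
+;             }
+;         }
+;
+; The assembly below does the carry rippling with an ADC chain instead.
+;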
+BEGINPROC rtBigNumMagnitudeMultiplyAssemblyWorker
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %if RTBIGNUM_ELEMENT_SIZE == 4
+ %error "sorry not implemented yet."
+ sorry not implemented yet.
+ %endif
+
+ %ifdef ASM_CALL64_GCC
+ %define pauResult rdi
+ %define pauMultiplier rsi
+ %define cMultiplier r9
+ %define pauMultiplicand rcx
+ %define cMultiplicand r8
+ mov r9d, edx ; cMultiplier
+ mov r8d, r8d ; cMultiplicand - paranoia
+ %define uMultiplier r10
+ %define iMultiplicand r11
+ %else
+ %define pauResult rcx
+ %define pauMultiplier r11
+ %define cMultiplier r8
+ %define pauMultiplicand r9
+ %define cMultiplicand r10
+ mov pauMultiplier, rdx
+ mov r10d, dword [rbp + 30h] ; cMultiplicand
+ mov r8d, r8d ; cMultiplier - paranoia
+ %define uMultiplier r12
+ push r12
+ %define iMultiplicand r13
+ push r13
+ %endif
+
+%elifdef RT_ARCH_X86
+ push edi
+ push esi
+ push ebx
+ sub esp, 10h
+ %define pauResult edi
+ mov pauResult, [ebp + 08h]
+ %define pauMultiplier dword [ebp + 0ch]
+ %define cMultiplier dword [ebp + 10h]
+ %define pauMultiplicand ecx
+ mov pauMultiplicand, [ebp + 14h]
+ %define cMultiplicand dword [ebp + 18h]
+ %define uMultiplier dword [ebp - 10h]
+ %define iMultiplicand ebx
+
+%else
+ %error "Unsupported arch."
+%endif
+
+ ;
+ ; Check that the multiplicand isn't empty (avoids an extra jump in the inner loop).
+ ;
+ cmp cMultiplicand, 0
+ je .done
+
+ ;
+ ; Loop thru each element in the multiplier.
+ ;
+ ; while (cMultiplier-- > 0)
+.multiplier_loop:
+ cmp cMultiplier, 0
+ jz .done
+ dec cMultiplier
+
+ ; uMultiplier = *pauMultiplier
+%ifdef RT_ARCH_X86
+ mov edx, pauMultiplier
+ mov eax, [edx]
+ mov uMultiplier, eax
+%else
+ mov uMultiplier, [pauMultiplier]
+%endif
+ ; for (iMultiplicand = 0; iMultiplicand < cMultiplicand; iMultiplicand++)
+ xor iMultiplicand, iMultiplicand
+.multiplicand_loop:
+ mov xAX, [pauMultiplicand + iMultiplicand * RTBIGNUM_ELEMENT_SIZE]
+ mul uMultiplier
+ add [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE], xAX
+ adc [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE + RTBIGNUM_ELEMENT_SIZE], xDX
+ jnc .next_multiplicand
+ lea xDX, [iMultiplicand + 2]
+.next_adc:
+ adc RTBIGNUM_ELEMENT_PRE [pauResult + xDX * RTBIGNUM_ELEMENT_SIZE], 0
+ inc xDX
+ jc .next_adc
+
+.next_multiplicand:
+ inc iMultiplicand ; iMultiplicand++
+ cmp iMultiplicand, cMultiplicand ; iMultiplicand < cMultiplicand
+ jb .multiplicand_loop
+
+ ; Advance and loop on multiplier.
+ add pauMultiplier, RTBIGNUM_ELEMENT_SIZE
+ add pauResult, RTBIGNUM_ELEMENT_SIZE
+ jmp .multiplier_loop
+
+.done:
+
+%ifdef RT_ARCH_AMD64
+ %ifdef ASM_CALL64_GCC
+ %else
+ pop r13
+ pop r12
+ %endif
+%elifdef RT_ARCH_X86
+ add esp, 10h
+ pop ebx
+ pop esi
+ pop edi
+%endif
+ leave
+ ret
+ENDPROC rtBigNumMagnitudeMultiplyAssemblyWorker
+
+;;
+; Assembly implementation of the D4 step of Knuth's division algorithm.
+;
+; This subtracts Divisor * Qhat from the dividend at the current J index.
+;
+; @returns true if negative result (unlikely), false if positive.
+; @param pauDividendJ x86:[ebp + 8] gcc:rdi msc:rcx
+; @param pauDivisor x86:[ebp + 12] gcc:rsi msc:rdx
+; @param cDivisor x86:[ebp + 16] gcc:edx msc:r8d
+; @param uQhat x86:[ebp + 20] gcc:rcx msc:r9
+;
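+; A hedged C-level sketch (reference only, not part of the build; shown for
+; 64-bit elements with a compiler-provided unsigned __int128):
+;
+;     RTBIGNUMELEMENT uMulCarry = 0;
+;     for (uint32_t i = 0; i < cDivisor; i++)
+;     {
+;         unsigned __int128 uProd = (unsigned __int128)uQhat * pauDivisor[i] + uMulCarry;
+;         RTBIGNUMELEMENT   uSub  = (RTBIGNUMELEMENT)uProd;
+;         uMulCarry = (RTBIGNUMELEMENT)(uProd >> 64);
+;         if (pauDividendJ[i] < uSub)
+;             uMulCarry++;                        /* borrow propagates into the next element */
+;         pauDividendJ[i] -= uSub;
+;     }
+;     bool fNegative = pauDividendJ[cDivisor] < uMulCarry;
+;     pauDividendJ[cDivisor] -= uMulCarry;
+;     return fNegative;
+;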
+BEGINPROC rtBigNumKnuthD4_MulSub
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+SEH64_END_PROLOGUE
+
+%ifdef RT_ARCH_AMD64
+ %if RTBIGNUM_ELEMENT_SIZE == 4
+ %error "sorry not implemented yet."
+ sorry not implemented yet.
+ %endif
+
+ %ifdef ASM_CALL64_GCC
+ %define pauDividendJ rdi
+ %define pauDivisor rsi
+ %define cDivisor r8
+ %define uQhat rcx
+ mov r8d, edx ; cDivisor
+ %define uMulCarry r11
+ %else
+ %define pauDividendJ rcx
+ %define pauDivisor r10
+ %define cDivisor r8
+ %define uQhat r9
+ mov r10, rdx ; pauDivisor
+ mov r8d, r8d ; cDivisor - paranoia
+ %define uMulCarry r11
+ %endif
+
+%elifdef RT_ARCH_X86
+ push edi
+ push esi
+ push ebx
+ %define pauDividendJ edi
+ mov pauDividendJ, [ebp + 08h]
+ %define pauDivisor esi
+ mov pauDivisor, [ebp + 0ch]
+ %define cDivisor ecx
+ mov cDivisor, [ebp + 10h]
+ %define uQhat dword [ebp + 14h]
+ %define uMulCarry ebx
+%else
+ %error "Unsupported arch."
+%endif
+
+%ifdef RT_STRICT
+ ;
+ ; Some sanity checks.
+ ;
+ cmp cDivisor, 0
+ jne .cDivisor_not_zero
+ int3
+.cDivisor_not_zero:
+%endif
+
+ ;
+ ; Initialize the loop.
+ ;
+ xor uMulCarry, uMulCarry
+
+ ;
+ ; do ... while (cDivisor-- > 0);
+ ;
+.the_loop:
+ ; RTUInt128MulU64ByU64(&uSub, uQhat, pauDivisor[i]);
+ mov xAX, uQhat
+ mul RTBIGNUM_ELEMENT_PRE [pauDivisor]
+ ; RTUInt128AssignAddU64(&uSub, uMulCarry);
+ add xAX, uMulCarry
+ adc xDX, 0
+ mov uMulCarry, xDX
+ ; Subtract uSub.s.Lo+fCarry from pauDividendJ[i]
+ sub [pauDividendJ], xAX
+ adc uMulCarry, 0
+%ifdef RT_STRICT
+ jnc .uMulCarry_did_not_overflow
+ int3
+.uMulCarry_did_not_overflow:
+%endif
+
+ ; Advance.
+ add pauDividendJ, RTBIGNUM_ELEMENT_SIZE
+ add pauDivisor, RTBIGNUM_ELEMENT_SIZE
+ dec cDivisor
+ jnz .the_loop
+
+ ;
+ ; Final dividend element (no corresponding divisor element).
+ ;
+ sub [pauDividendJ], uMulCarry
+ sbb eax, eax
+ and eax, 1
+
+.done:
+%ifdef RT_ARCH_AMD64
+%elifdef RT_ARCH_X86
+ pop ebx
+ pop esi
+ pop edi
+%endif
+ leave
+ ret
+ENDPROC rtBigNumKnuthD4_MulSub
+