Diffstat (limited to 'src/math/big/arith_s390x.s')
-rw-r--r--  src/math/big/arith_s390x.s  795
1 file changed, 795 insertions, 0 deletions
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s
new file mode 100644
index 0000000..caa4db0
--- /dev/null
+++ b/src/math/big/arith_s390x.s
@@ -0,0 +1,795 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,s390x
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func mulWW(x, y Word) (z1, z0 Word)
+TEXT ·mulWW(SB), NOSPLIT, $0
+    MOVD   x+0(FP), R3
+    MOVD   y+8(FP), R4
+    MULHDU R3, R4
+    MOVD   R10, z1+16(FP)
+    MOVD   R11, z0+24(FP)
+    RET
+
+// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
+// func addVV(z, x, y []Word) (c Word)
+
+TEXT ·addVV(SB), NOSPLIT, $0
+    MOVD addvectorfacility+0x00(SB), R1
+    BR   (R1)
+
+TEXT ·addVV_check(SB), NOSPLIT, $0
+    MOVB   ·hasVX(SB), R1
+    CMPBEQ R1, $1, vectorimpl // hasVX == 1, vector facility supported
+    MOVD   $addvectorfacility+0x00(SB), R1
+    MOVD   $·addVV_novec(SB), R2
+    MOVD   R2, 0(R1)
+
+    // MOVD $·addVV_novec(SB), 0(R1)
+    BR ·addVV_novec(SB)
+
+vectorimpl:
+    MOVD $addvectorfacility+0x00(SB), R1
+    MOVD $·addVV_vec(SB), R2
+    MOVD R2, 0(R1)
+
+    // MOVD $·addVV_vec(SB), 0(R1)
+    BR ·addVV_vec(SB)
+
+GLOBL addvectorfacility+0x00(SB), NOPTR, $8
+DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
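addVV is dispatched through the addvectorfacility function pointer: the pointer starts out aimed at addVV_check, which probes ·hasVX once, overwrites the pointer with the vector or scalar implementation, and branches to it, so every later call goes straight to the right code. A rough Go rendering of that one-time dispatch pattern (all names here are illustrative, not the actual math/big declarations):

    // One-time dispatch through a function pointer, as addVV/addVV_check
    // do with addvectorfacility. Names are illustrative only.
    package main

    import "fmt"

    var hasVX = false // the real flag is set by runtime CPU feature detection

    var addVVImpl func(z, x, y []uint64) uint64

    // The DATA directive installs addVV_check at link time; an init
    // function is the closest portable analogue.
    func init() { addVVImpl = addVVCheck }

    // addVVCheck runs at most once per program: it installs the chosen
    // implementation in the pointer and forwards the call.
    func addVVCheck(z, x, y []uint64) uint64 {
        if hasVX {
            addVVImpl = addVVVec
        } else {
            addVVImpl = addVVNovec
        }
        return addVVImpl(z, x, y)
    }

    func addVVVec(z, x, y []uint64) uint64   { return 0 } // vector path stub
    func addVVNovec(z, x, y []uint64) uint64 { return 0 } // scalar path stub

    func main() {
        fmt.Println(addVVImpl(nil, nil, nil)) // first call goes via addVVCheck
    }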
+TEXT ·addVV_vec(SB), NOSPLIT, $0
+    MOVD z_len+8(FP), R3
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD z+0(FP), R2
+
+    MOVD $0, R4  // c = 0
+    MOVD $0, R0  // make sure it's zero
+    MOVD $0, R10 // i = 0
+
+    // s/BLT/BR/ below to disable the unrolled loop
+    SUB $4, R3  // n -= 4
+    BLT v1      // if n < 0 goto v1
+    SUB $12, R3 // n -= 16
+    BLT A1      // if n < 0 goto A1
+
+    MOVD R8, R5
+    MOVD R9, R6
+    MOVD R2, R7
+
+    // n >= 0
+    // regular loop body unrolled 16x
+    VZERO V0 // c = 0
+
+UU1:
+    VLM  0(R5), V1, V4 // 64 bytes into V1..V4
+    ADD  $64, R5
+    VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
+    VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
+
+    VLM  0(R6), V9, V12 // 64 bytes into V9..V12
+    ADD  $64, R6
+    VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
+    VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
+
+    VACCCQ V1, V9, V0, V25
+    VACQ   V1, V9, V0, V17
+    VACCCQ V2, V10, V25, V26
+    VACQ   V2, V10, V25, V18
+
+    VLM 0(R5), V5, V6   // 32 bytes into V5..V6
+    VLM 0(R6), V13, V14 // 32 bytes into V13..V14
+    ADD $32, R5
+    ADD $32, R6
+
+    VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
+    VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
+    VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
+    VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
+
+    VACCCQ V3, V11, V26, V27
+    VACQ   V3, V11, V26, V19
+    VACCCQ V4, V12, V27, V28
+    VACQ   V4, V12, V27, V20
+
+    VLM 0(R5), V7, V8   // 32 bytes into V7..V8
+    VLM 0(R6), V15, V16 // 32 bytes into V15..V16
+    ADD $32, R5
+    ADD $32, R6
+
+    VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
+    VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
+    VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
+    VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
+
+    VACCCQ V5, V13, V28, V29
+    VACQ   V5, V13, V28, V21
+    VACCCQ V6, V14, V29, V30
+    VACQ   V6, V14, V29, V22
+
+    VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
+    VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
+    VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
+    VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
+
+    VACCCQ V7, V15, V30, V31
+    VACQ   V7, V15, V30, V23
+    VACCCQ V8, V16, V31, V0 // V0 has carry-over
+    VACQ   V8, V16, V31, V24
+
+    VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
+    VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
+    VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
+    VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
+    VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
+    VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
+    VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
+    VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
+    VSTM  V17, V24, 0(R7)     // 128 bytes into z
+    ADD   $128, R7
+    ADD   $128, R10  // i += 16
+    SUB   $16, R3    // n -= 16
+    BGE   UU1        // if n >= 0 goto UU1
+    VLGVG $1, V0, R4 // put cf into R4
+    NEG   R4, R4     // save cf
+
+A1:
+    ADD $12, R3 // n += 16
+
+    // s/BLT/BR/ below to disable the unrolled loop
+    BLT v1 // if n < 0 goto v1
+
+U1: // n >= 0
+    // regular loop body unrolled 4x
+    MOVD 0(R8)(R10*1), R5
+    MOVD 8(R8)(R10*1), R6
+    MOVD 16(R8)(R10*1), R7
+    MOVD 24(R8)(R10*1), R1
+    ADDC R4, R4 // restore CF
+    MOVD 0(R9)(R10*1), R11
+    ADDE R11, R5
+    MOVD 8(R9)(R10*1), R11
+    ADDE R11, R6
+    MOVD 16(R9)(R10*1), R11
+    ADDE R11, R7
+    MOVD 24(R9)(R10*1), R11
+    ADDE R11, R1
+    MOVD R0, R4
+    ADDE R4, R4 // save CF
+    NEG  R4, R4
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R6, 8(R2)(R10*1)
+    MOVD R7, 16(R2)(R10*1)
+    MOVD R1, 24(R2)(R10*1)
+
+    ADD $32, R10 // i += 4
+    SUB $4, R3   // n -= 4
+    BGE U1       // if n >= 0 goto U1
+
+v1:
+    ADD $4, R3 // n += 4
+    BLE E1     // if n <= 0 goto E1
+
+L1: // n > 0
+    ADDC R4, R4 // restore CF
+    MOVD 0(R8)(R10*1), R5
+    MOVD 0(R9)(R10*1), R11
+    ADDE R11, R5
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R0, R4
+    ADDE R4, R4 // save CF
+    NEG  R4, R4
+
+    ADD $8, R10 // i++
+    SUB $1, R3  // n--
+    BGT L1      // if n > 0 goto L1
+
+E1:
+    NEG  R4, R4
+    MOVD R4, c+72(FP) // return c
+    RET
+
+TEXT ·addVV_novec(SB), NOSPLIT, $0
+novec:
+    MOVD z_len+8(FP), R3
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD z+0(FP), R2
+
+    MOVD $0, R4  // c = 0
+    MOVD $0, R0  // make sure it's zero
+    MOVD $0, R10 // i = 0
+
+    // s/BLT/BR/ below to disable the unrolled loop
+    SUB $4, R3 // n -= 4
+    BLT v1n    // if n < 0 goto v1n
+
+U1n: // n >= 0
+    // regular loop body unrolled 4x
+    MOVD 0(R8)(R10*1), R5
+    MOVD 8(R8)(R10*1), R6
+    MOVD 16(R8)(R10*1), R7
+    MOVD 24(R8)(R10*1), R1
+    ADDC R4, R4 // restore CF
+    MOVD 0(R9)(R10*1), R11
+    ADDE R11, R5
+    MOVD 8(R9)(R10*1), R11
+    ADDE R11, R6
+    MOVD 16(R9)(R10*1), R11
+    ADDE R11, R7
+    MOVD 24(R9)(R10*1), R11
+    ADDE R11, R1
+    MOVD R0, R4
+    ADDE R4, R4 // save CF
+    NEG  R4, R4
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R6, 8(R2)(R10*1)
+    MOVD R7, 16(R2)(R10*1)
+    MOVD R1, 24(R2)(R10*1)
+
+    ADD $32, R10 // i += 4
+    SUB $4, R3   // n -= 4
+    BGE U1n      // if n >= 0 goto U1n
+
+v1n:
+    ADD $4, R3 // n += 4
+    BLE E1n    // if n <= 0 goto E1n
+
+L1n: // n > 0
+    ADDC R4, R4 // restore CF
+    MOVD 0(R8)(R10*1), R5
+    MOVD 0(R9)(R10*1), R11
+    ADDE R11, R5
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R0, R4
+    ADDE R4, R4 // save CF
+    NEG  R4, R4
+
+    ADD $8, R10 // i++
+    SUB $1, R3  // n--
+    BGT L1n     // if n > 0 goto L1n
+
+E1n:
+    NEG  R4, R4
+    MOVD R4, c+72(FP) // return c
+    RET
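Both addVV_vec and addVV_novec compute the same function as the portable addVV in arith.go: a multi-word addition that threads a carry through the whole vector and returns the final carry. For reference, a generic sketch using math/bits.Add64 (a Go 1.12+ API; this assembly predates it):

    // Generic equivalent of what addVV computes, for reference only.
    package main

    import (
        "fmt"
        "math/bits"
    )

    func addVV(z, x, y []uint64) (c uint64) {
        for i := range z {
            // z[i] = x[i] + y[i] + carry; the carry out feeds the next word.
            z[i], c = bits.Add64(x[i], y[i], c)
        }
        return c
    }

    func main() {
        z := make([]uint64, 2)
        c := addVV(z, []uint64{^uint64(0), 1}, []uint64{1, 2})
        fmt.Println(z, c) // [0 4] 0
    }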
+TEXT ·subVV(SB), NOSPLIT, $0
+    MOVD subvectorfacility+0x00(SB), R1
+    BR   (R1)
+
+TEXT ·subVV_check(SB), NOSPLIT, $0
+    MOVB   ·hasVX(SB), R1
+    CMPBEQ R1, $1, vectorimpl // hasVX == 1, vector facility supported
+    MOVD   $subvectorfacility+0x00(SB), R1
+    MOVD   $·subVV_novec(SB), R2
+    MOVD   R2, 0(R1)
+
+    // MOVD $·subVV_novec(SB), 0(R1)
+    BR ·subVV_novec(SB)
+
+vectorimpl:
+    MOVD $subvectorfacility+0x00(SB), R1
+    MOVD $·subVV_vec(SB), R2
+    MOVD R2, 0(R1)
+
+    // MOVD $·subVV_vec(SB), 0(R1)
+    BR ·subVV_vec(SB)
+
+GLOBL subvectorfacility+0x00(SB), NOPTR, $8
+DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
+
+// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
+// func subVV(z, x, y []Word) (c Word)
+// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
+TEXT ·subVV_vec(SB), NOSPLIT, $0
+    MOVD z_len+8(FP), R3
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD z+0(FP), R2
+    MOVD $0, R4  // c = 0
+    MOVD $0, R0  // make sure it's zero
+    MOVD $0, R10 // i = 0
+
+    // s/BLT/BR/ below to disable the unrolled loop
+    SUB $4, R3  // n -= 4
+    BLT v1      // if n < 0 goto v1
+    SUB $12, R3 // n -= 16
+    BLT A1      // if n < 0 goto A1
+
+    MOVD R8, R5
+    MOVD R9, R6
+    MOVD R2, R7
+
+    // n >= 0
+    // regular loop body unrolled 16x
+    VZERO V0         // cf = 0
+    MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
+    VLVGG $1, R4, V0 // put carry into V0
+
+UU1:
+    VLM  0(R5), V1, V4 // 64 bytes into V1..V4
+    ADD  $64, R5
+    VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
+    VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
+
+    VLM  0(R6), V9, V12 // 64 bytes into V9..V12
+    ADD  $64, R6
+    VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
+    VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
+
+    VSBCBIQ V1, V9, V0, V25
+    VSBIQ   V1, V9, V0, V17
+    VSBCBIQ V2, V10, V25, V26
+    VSBIQ   V2, V10, V25, V18
+
+    VLM 0(R5), V5, V6   // 32 bytes into V5..V6
+    VLM 0(R6), V13, V14 // 32 bytes into V13..V14
+    ADD $32, R5
+    ADD $32, R6
+
+    VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
+    VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
+    VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
+    VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
+
+    VSBCBIQ V3, V11, V26, V27
+    VSBIQ   V3, V11, V26, V19
+    VSBCBIQ V4, V12, V27, V28
+    VSBIQ   V4, V12, V27, V20
+
+    VLM 0(R5), V7, V8   // 32 bytes into V7..V8
+    VLM 0(R6), V15, V16 // 32 bytes into V15..V16
+    ADD $32, R5
+    ADD $32, R6
+
+    VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
+    VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
+    VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
+    VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
+
+    VSBCBIQ V5, V13, V28, V29
+    VSBIQ   V5, V13, V28, V21
+    VSBCBIQ V6, V14, V29, V30
+    VSBIQ   V6, V14, V29, V22
+
+    VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
+    VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
+    VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
+    VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
+
+    VSBCBIQ V7, V15, V30, V31
+    VSBIQ   V7, V15, V30, V23
+    VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
+    VSBIQ   V8, V16, V31, V24
+
+    VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
+    VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
+    VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
+    VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
+    VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
+    VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
+    VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
+    VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
+    VSTM  V17, V24, 0(R7)     // 128 bytes into z
+    ADD   $128, R7
+    ADD   $128, R10  // i += 16
+    SUB   $16, R3    // n -= 16
+    BGE   UU1        // if n >= 0 goto UU1
+    VLGVG $1, V0, R4 // put cf into R4
+    SUB   $1, R4     // save cf
+
+A1:
+    ADD $12, R3 // n += 16
+    BLT v1      // if n < 0 goto v1
+
+U1: // n >= 0
+    // regular loop body unrolled 4x
+    MOVD 0(R8)(R10*1), R5
+    MOVD 8(R8)(R10*1), R6
+    MOVD 16(R8)(R10*1), R7
+    MOVD 24(R8)(R10*1), R1
+    MOVD R0, R11
+    SUBC R4, R11 // restore CF
+    MOVD 0(R9)(R10*1), R11
+    SUBE R11, R5
+    MOVD 8(R9)(R10*1), R11
+    SUBE R11, R6
+    MOVD 16(R9)(R10*1), R11
+    SUBE R11, R7
+    MOVD 24(R9)(R10*1), R11
+    SUBE R11, R1
+    MOVD R0, R4
+    SUBE R4, R4 // save CF
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R6, 8(R2)(R10*1)
+    MOVD R7, 16(R2)(R10*1)
+    MOVD R1, 24(R2)(R10*1)
+
+    ADD $32, R10 // i += 4
+    SUB $4, R3   // n -= 4
+    BGE U1       // if n >= 0 goto U1
+
+v1:
+    ADD $4, R3 // n += 4
+    BLE E1     // if n <= 0 goto E1
+
+L1: // n > 0
+    MOVD R0, R11
+    SUBC R4, R11 // restore CF
+    MOVD 0(R8)(R10*1), R5
+    MOVD 0(R9)(R10*1), R11
+    SUBE R11, R5
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R0, R4
+    SUBE R4, R4 // save CF
+
+    ADD $8, R10 // i++
+    SUB $1, R3  // n--
+    BGT L1      // if n > 0 goto L1
+
+E1:
+    NEG  R4, R4
+    MOVD R4, c+72(FP) // return c
+    RET
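One subtlety in subVV_vec: on s390x, subtract-logical sets the carry flag to 1 when no borrow occurred and to 0 when one did, which is why the loop seeds V0 with 1 ("no borrow"), converts back with SUB $1 after the vector loop, and ends with a NEG so the returned borrow is a plain 0 or 1. bits.Sub64 uses the opposite, more conventional encoding (borrow == 1 on underflow), as in this generic sketch of subVV:

    // Generic equivalent of subVV. Note bits.Sub64's borrow is 1 when a
    // borrow occurred, the inverse of the s390x carry-flag convention.
    package main

    import (
        "fmt"
        "math/bits"
    )

    func subVV(z, x, y []uint64) (c uint64) {
        for i := range z {
            // z[i] = x[i] - y[i] - borrow; the borrow out feeds the next word.
            z[i], c = bits.Sub64(x[i], y[i], c)
        }
        return c
    }

    func main() {
        z := make([]uint64, 2)
        c := subVV(z, []uint64{0, 5}, []uint64{1, 2})
        fmt.Println(z, c) // [18446744073709551615 2] 0
    }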
+// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
+// func subVV(z, x, y []Word) (c Word)
+// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
+TEXT ·subVV_novec(SB), NOSPLIT, $0
+    MOVD z_len+8(FP), R3
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD z+0(FP), R2
+
+    MOVD $0, R4  // c = 0
+    MOVD $0, R0  // make sure it's zero
+    MOVD $0, R10 // i = 0
+
+    // s/BLT/BR/ below to disable the unrolled loop
+    SUB $4, R3 // n -= 4
+    BLT v1     // if n < 0 goto v1
+
+U1: // n >= 0
+    // regular loop body unrolled 4x
+    MOVD 0(R8)(R10*1), R5
+    MOVD 8(R8)(R10*1), R6
+    MOVD 16(R8)(R10*1), R7
+    MOVD 24(R8)(R10*1), R1
+    MOVD R0, R11
+    SUBC R4, R11 // restore CF
+    MOVD 0(R9)(R10*1), R11
+    SUBE R11, R5
+    MOVD 8(R9)(R10*1), R11
+    SUBE R11, R6
+    MOVD 16(R9)(R10*1), R11
+    SUBE R11, R7
+    MOVD 24(R9)(R10*1), R11
+    SUBE R11, R1
+    MOVD R0, R4
+    SUBE R4, R4 // save CF
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R6, 8(R2)(R10*1)
+    MOVD R7, 16(R2)(R10*1)
+    MOVD R1, 24(R2)(R10*1)
+
+    ADD $32, R10 // i += 4
+    SUB $4, R3   // n -= 4
+    BGE U1       // if n >= 0 goto U1
+
+v1:
+    ADD $4, R3 // n += 4
+    BLE E1     // if n <= 0 goto E1
+
+L1: // n > 0
+    MOVD R0, R11
+    SUBC R4, R11 // restore CF
+    MOVD 0(R8)(R10*1), R5
+    MOVD 0(R9)(R10*1), R11
+    SUBE R11, R5
+    MOVD R5, 0(R2)(R10*1)
+    MOVD R0, R4
+    SUBE R4, R4 // save CF
+
+    ADD $8, R10 // i++
+    SUB $1, R3  // n--
+    BGT L1      // if n > 0 goto L1
+
+E1:
+    NEG  R4, R4
+    MOVD R4, c+72(FP) // return c
+    RET
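The scalar loops cannot keep the condition code alive across the compare-and-branch at the bottom, so they materialize the carry into R4 and regenerate the flag at the top of the next iteration: the add paths save it as -carry (ADDE R4, R4 then NEG R4, R4) and restore it with ADDC R4, R4, which carries out exactly when R4 is all ones; the subtract paths do the analogous dance with SUBE/SUBC. A small Go sketch of the add-side round trip (math/bits is used purely for illustration):

    // Round trip of the carry save/restore trick in the scalar loops: the
    // carry bit is stored as 0 or ^uint64(0), and adding the saved value
    // to itself carries out only in the all-ones case.
    package main

    import (
        "fmt"
        "math/bits"
    )

    func saveCF(carry uint64) uint64 { return -carry } // ADDE R4, R4; NEG R4, R4

    func restoreCF(saved uint64) uint64 {
        _, carry := bits.Add64(saved, saved, 0) // ADDC R4, R4
        return carry
    }

    func main() {
        for _, c := range []uint64{0, 1} {
            fmt.Println(c, restoreCF(saveCF(c))) // both values round-trip unchanged
        }
    }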
+TEXT ·addVW(SB), NOSPLIT, $0
+    MOVD z_len+8(FP), R5 // length of z
+    MOVD x+24(FP), R6
+    MOVD y+48(FP), R7    // c = y
+    MOVD z+0(FP), R8
+
+    CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
+
+    // Add the first two words, and determine which path (copy path or
+    // loop path) to take based on the carry flag.
+    ADDC   0(R6), R7
+    MOVD   R7, 0(R8)
+    CMPBEQ R5, $1, returnResult // len(z) == 1
+    MOVD   $0, R9
+    ADDE   8(R6), R9
+    MOVD   R9, 8(R8)
+    CMPBEQ R5, $2, returnResult // len(z) == 2
+
+    // Update the counters
+    MOVD $16, R12    // i = 2
+    MOVD $-2(R5), R5 // n = n - 2
+
+loopOverEachWord:
+    BRC  $12, copySetup // carry = 0, copy the rest
+    MOVD $1, R9
+
+    // Originally we used the carry flag generated in the previous iteration
+    // (i.e. ADDE could be used here to do the addition). However, since we
+    // already know the carry is 1 (otherwise we would have branched to the
+    // copy section), we can use ADDC here so the current iteration does not
+    // depend on the carry flag generated in the previous iteration. Removing
+    // that cross-iteration dependency helps when branches are predicted and
+    // iterations overlap in flight.
+    ADDC 0(R6)(R12*1), R9
+    MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
+
+    MOVD  $8(R12), R12         // i++
+    BRCTG R5, loopOverEachWord // n--
+
+// Return the current carry value
+returnResult:
+    MOVD $0, R0
+    ADDE R0, R0
+    MOVD R0, c+56(FP)
+    RET
+
+// Update the positions of x (R6) and z (R8) based on the current counter
+// value and perform the copying. Assuming x and z either do not overlap or
+// point to the same memory region, we can use a faster copy based only on
+// MVC here. The implementation below has three copy loops, copying one word,
+// 4 words, and 32 words at a time; benchmarks showed this to be faster than
+// calling runtime·memmove.
+copySetup:
+    ADD R12, R6
+    ADD R12, R8
+
+    CMPBGE R5, $4, mediumLoop
+
+smallLoop: // copies words one at a time when n < 4
+    CMPBEQ R5, $0, returnZero
+    MVC    $8, 0(R6), 0(R8)
+    CMPBEQ R5, $1, returnZero
+    MVC    $8, 8(R6), 8(R8)
+    CMPBEQ R5, $2, returnZero
+    MVC    $8, 16(R6), 16(R8)
+
+returnZero:
+    MOVD $0, c+56(FP) // return 0 as carry
+    RET
+
+mediumLoop:
+    CMPBLT R5, $4, smallLoop
+    CMPBLT R5, $32, mediumLoopBody
+
+largeLoop: // copies 256 bytes (32 words) at a time
+    MVC    $256, 0(R6), 0(R8)
+    MOVD   $256(R6), R6
+    MOVD   $256(R8), R8
+    MOVD   $-32(R5), R5
+    CMPBGE R5, $32, largeLoop
+    BR     mediumLoop
+
+mediumLoopBody: // copies 32 bytes (4 words) at a time
+    MVC    $32, 0(R6), 0(R8)
+    MOVD   $32(R6), R6
+    MOVD   $32(R8), R8
+    MOVD   $-4(R5), R5
+    CMPBGE R5, $4, mediumLoopBody
+    BR     smallLoop
+
+returnC:
+    MOVD R7, c+56(FP)
+    RET
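addVW only runs the word-by-word loop while the carry is still alive; the first time BRC sees a clear carry it jumps to copySetup, because adding zero cannot change any remaining word, and the tail is finished with MVC block copies (256, 32, then 8 bytes at a time). The same idea in generic Go (a sketch; the portable fallback actually lives in arith.go):

    // Sketch of addVW's strategy: propagate the carry only while it is
    // non-zero, then bulk-copy the untouched tail (the assembly uses MVC).
    package main

    import (
        "fmt"
        "math/bits"
    )

    func addVW(z, x []uint64, y uint64) (c uint64) {
        c = y
        for i := range z {
            if c == 0 {
                copy(z[i:], x[i:]) // carry is dead: the rest is a plain copy
                return 0
            }
            z[i], c = bits.Add64(x[i], c, 0) // add the pending carry word
        }
        return c
    }

    func main() {
        z := make([]uint64, 3)
        c := addVW(z, []uint64{^uint64(0), 7, 9}, 1)
        fmt.Println(z, c) // [0 8 9] 0
    }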
+TEXT ·subVW(SB), NOSPLIT, $0
+    MOVD z_len+8(FP), R5
+    MOVD x+24(FP), R6
+    MOVD y+48(FP), R7 // the borrow amount (y) passed in
+    MOVD z+0(FP), R8
+    MOVD $0, R0       // R0 is a temporary used during computation; make sure it is zero
+
+    CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
+
+    // Subtract the first two words, and determine which path (copy path or
+    // loop path) to take based on the borrow flag.
+    MOVD   0(R6), R9
+    SUBC   R7, R9
+    MOVD   R9, 0(R8)
+    CMPBEQ R5, $1, returnResult
+    MOVD   8(R6), R9
+    SUBE   R0, R9
+    MOVD   R9, 8(R8)
+    CMPBEQ R5, $2, returnResult
+
+    // Update the counters
+    MOVD $16, R12    // i = 2
+    MOVD $-2(R5), R5 // n = n - 2
+
+loopOverEachWord:
+    BRC  $3, copySetup // no borrow, copy the rest
+    MOVD 0(R6)(R12*1), R9
+
+    // Originally we used the borrow flag generated in the previous iteration
+    // (i.e. SUBE could be used here to do the subtraction). However, since we
+    // already know the borrow is 1 (otherwise we would have branched to the
+    // copy section), we can use SUBC here so the current iteration does not
+    // depend on the borrow flag generated in the previous iteration. Removing
+    // that cross-iteration dependency helps when branches are predicted and
+    // iterations overlap in flight.
+    SUBC $1, R9
+    MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
+
+    MOVD  $8(R12), R12         // i++
+    BRCTG R5, loopOverEachWord // n--
+
+// return the current borrow value
+returnResult:
+    SUBE R0, R0
+    NEG  R0, R0
+    MOVD R0, c+56(FP)
+    RET
+
+// Update the positions of x (R6) and z (R8) based on the current counter
+// value and perform the copying. Assuming x and z either do not overlap or
+// point to the same memory region, we can use a faster copy based only on
+// MVC here. The implementation below has three copy loops, copying one word,
+// 4 words, and 32 words at a time; benchmarks showed this to be faster than
+// calling runtime·memmove.
+copySetup:
+    ADD R12, R6
+    ADD R12, R8
+
+    CMPBGE R5, $4, mediumLoop
+
+smallLoop: // copies words one at a time when n < 4
+    CMPBEQ R5, $0, returnZero
+    MVC    $8, 0(R6), 0(R8)
+    CMPBEQ R5, $1, returnZero
+    MVC    $8, 8(R6), 8(R8)
+    CMPBEQ R5, $2, returnZero
+    MVC    $8, 16(R6), 16(R8)
+
+returnZero:
+    MOVD $0, c+56(FP) // return 0 as borrow
+    RET
+
+mediumLoop:
+    CMPBLT R5, $4, smallLoop
+    CMPBLT R5, $32, mediumLoopBody
+
+largeLoop: // copies 256 bytes (32 words) at a time
+    MVC    $256, 0(R6), 0(R8)
+    MOVD   $256(R6), R6
+    MOVD   $256(R8), R8
+    MOVD   $-32(R5), R5
+    CMPBGE R5, $32, largeLoop
+    BR     mediumLoop
+
+mediumLoopBody: // copies 32 bytes (4 words) at a time
+    MVC    $32, 0(R6), 0(R8)
+    MOVD   $32(R6), R6
+    MOVD   $32(R8), R8
+    MOVD   $-4(R5), R5
+    CMPBGE R5, $4, mediumLoopBody
+    BR     smallLoop
+
+returnC:
+    MOVD R7, c+56(FP)
+    RET
+
+// func shlVU(z, x []Word, s uint) (c Word)
+TEXT ·shlVU(SB), NOSPLIT, $0
+    BR ·shlVU_g(SB)
+
+// func shrVU(z, x []Word, s uint) (c Word)
+TEXT ·shrVU(SB), NOSPLIT, $0
+    BR ·shrVU_g(SB)
+
+// CX = R4, r8 = r8, r9 = r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1 (R0 set to 0) + use R11 + use R7 for i
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB), NOSPLIT, $0
+    MOVD z+0(FP), R2
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD r+56(FP), R4 // c = r
+    MOVD z_len+8(FP), R5
+    MOVD $0, R1 // i*8 = 0
+    MOVD $0, R7 // i = 0
+    MOVD $0, R0 // make sure it's zero
+    BR   E5
+
+L5:
+    MOVD   (R8)(R1*1), R6
+    MULHDU R9, R6
+    ADDC   R4, R11 // add to low order bits
+    ADDE   R0, R6
+    MOVD   R11, (R2)(R1*1)
+    MOVD   R6, R4
+    ADD    $8, R1 // i*8 + 8
+    ADD    $1, R7 // i++
+
+E5:
+    CMPBLT R7, R5, L5 // i < n
+
+    MOVD R4, c+64(FP)
+    RET
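mulAddVWW computes z = x*y + r for a single machine word y: each step forms the 128-bit product x[i]*y (MULHDU leaves the low half in R11 and the high half in the second operand register), adds the running carry to the low half, stores it, and carries the high half forward. A generic sketch with math/bits:

    // Generic equivalent of mulAddVWW: z = x*y + r for one word y,
    // returning the final carry word.
    package main

    import (
        "fmt"
        "math/bits"
    )

    func mulAddVWW(z, x []uint64, y, r uint64) (c uint64) {
        c = r
        for i := range z {
            hi, lo := bits.Mul64(x[i], y) // 128-bit product
            lo, carry := bits.Add64(lo, c, 0)
            z[i] = lo
            c = hi + carry // cannot wrap: hi is at most 2^64-2
        }
        return c
    }

    func main() {
        z := make([]uint64, 2)
        c := mulAddVWW(z, []uint64{^uint64(0), 2}, 3, 1)
        fmt.Println(z, c) // [18446744073709551614 8] 0
    }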
+// func addMulVVW(z, x []Word, y Word) (c Word)
+// CX = R4, r8 = r8, r9 = r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12 = r12, BX = R1 (R0 set to 0) + use R11 + use R7 for i
+TEXT ·addMulVVW(SB), NOSPLIT, $0
+    MOVD z+0(FP), R2
+    MOVD x+24(FP), R8
+    MOVD y+48(FP), R9
+    MOVD z_len+8(FP), R5
+
+    MOVD $0, R1 // i*8 = 0
+    MOVD $0, R7 // i = 0
+    MOVD $0, R0 // make sure it's zero
+    MOVD $0, R4 // c = 0
+
+    MOVD   R5, R12
+    AND    $-2, R12
+    CMPBGE R5, $2, A6
+    BR     E6
+
+A6:
+    MOVD   (R8)(R1*1), R6
+    MULHDU R9, R6
+    MOVD   (R2)(R1*1), R10
+    ADDC   R10, R11 // add to low order bits
+    ADDE   R0, R6
+    ADDC   R4, R11
+    ADDE   R0, R6
+    MOVD   R6, R4
+    MOVD   R11, (R2)(R1*1)
+
+    MOVD   (8)(R8)(R1*1), R6
+    MULHDU R9, R6
+    MOVD   (8)(R2)(R1*1), R10
+    ADDC   R10, R11 // add to low order bits
+    ADDE   R0, R6
+    ADDC   R4, R11
+    ADDE   R0, R6
+    MOVD   R6, R4
+    MOVD   R11, (8)(R2)(R1*1)
+
+    ADD $16, R1 // i*8 + 16
+    ADD $2, R7  // i += 2
+
+    CMPBLT R7, R12, A6
+    BR     E6
+
+L6:
+    MOVD   (R8)(R1*1), R6
+    MULHDU R9, R6
+    MOVD   (R2)(R1*1), R10
+    ADDC   R10, R11 // add to low order bits
+    ADDE   R0, R6
+    ADDC   R4, R11
+    ADDE   R0, R6
+    MOVD   R6, R4
+    MOVD   R11, (R2)(R1*1)
+
+    ADD $8, R1 // i*8 + 8
+    ADD $1, R7 // i++
+
+E6:
+    CMPBLT R7, R5, L6 // i < n
+
+    MOVD R4, c+56(FP)
+    RET
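addMulVVW is the accumulating variant, z += x*y, used as the inner loop of the multiplication routines; the unrolled A6 body handles two words per iteration and L6 mops up any odd word. Each word needs two carry additions, one for the existing z[i] and one for the running carry, mirroring the two ADDC/ADDE pairs in the assembly. A generic sketch:

    // Generic equivalent of addMulVVW: z += x*y for a single word y,
    // returning the final carry.
    package main

    import (
        "fmt"
        "math/bits"
    )

    func addMulVVW(z, x []uint64, y uint64) (c uint64) {
        for i := range z {
            hi, lo := bits.Mul64(x[i], y)
            lo, cc := bits.Add64(lo, z[i], 0) // fold in the existing z word
            hi += cc
            lo, cc = bits.Add64(lo, c, 0) // fold in the running carry
            hi += cc                      // cannot wrap: when hi is maximal, lo is tiny, so at most one carry fires
            z[i] = lo
            c = hi
        }
        return c
    }

    func main() {
        z := []uint64{5, 5}
        c := addMulVVW(z, []uint64{^uint64(0), 0}, 2)
        fmt.Println(z, c) // [3 7] 0
    }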