author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-16 19:23:18 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-16 19:23:18 +0000
commit    43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
tree      fd92518b7024bc74031f78a1cf9e454b65e73665 /src/internal/bytealg/indexbyte_ppc64x.s
parent    Initial commit. (diff)
Adding upstream version 1.20.14.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/internal/bytealg/indexbyte_ppc64x.s')
 -rw-r--r--  src/internal/bytealg/indexbyte_ppc64x.s | 391
 1 file changed, 391 insertions(+), 0 deletions(-)
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..1a6e852
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,391 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = byte array pointer
+ // R4 = length
+ MOVD R6, R5 // R5 = byte
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ BR indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+ // R3 = string
+ // R4 = length
+ // R5 = byte
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ BR indexbytebody<>(SB)
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R17 // Save base address for calculating the index later.
+ RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
+ RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
+ ADD R4,R3,R7 // Last acceptable address in R7.
+
+ RLDIMI $16,R5,$32,R5
+ CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
+ MOVD $-1,R9
+ RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
+ RLDIMI $32,R5,$0,R5
+ MOVD R7,R10 // Save last acceptable address in R10 for later.
+ ADD $-1,R7,R7
+#ifdef GOARCH_ppc64le
+ SLD R6,R9,R9 // Prepare mask for Little Endian
+#else
+ SRD R6,R9,R9 // Same for Big Endian
+#endif
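+	// At this point R5 holds the target byte replicated into all eight byte
+	// lanes (e.g. 0x2f becomes 0x2f2f2f2f2f2f2f2f), and R9 is a mask that
+	// clears the bytes of the first aligned doubleword that precede the
+	// start of the buffer, so stray matches there are ignored.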
+ BLT small_string // Jump to the small string case if it's <32 bytes.
+	CMP R16,$1 // optimize for POWER8 vs POWER9
+ BNE power8
+ VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+ MTVRD R5,V1
+ LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+ VSLB V11,V10,V10 // to extract the first bit of match result into GPR
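+	// LVSL with a zero address yields the byte sequence {0,1,...,15}; shifting
+	// each element left by 3 turns it into the bit indices {0x00,0x08,...,0x78},
+	// i.e. bit 0 of every byte, which VBPERMQ gathers into 16 contiguous bits.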
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+ CMP R4,$64
+ MOVD $16,R11
+ MOVD R3,R8
+ BLT cmp32
+ MOVD $32,R12
+ MOVD $48,R6
+
+loop64:
+ LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // match found at R8, jump out
+
+ LXVB16X (R8)(R11),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // match found at R8+16 bytes, jump out
+
+ LXVB16X (R8)(R12),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat2 // match found at R8+32 bytes, jump out
+
+ LXVB16X (R8)(R6),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat3 // match found at R8+48 bytes, jump out
+ ADD $64,R8
+ ADD $-64,R4
+ CMP R4,$64 // >=64 bytes left to scan?
+ BGE loop64
+ CMP R4,$32
+ BLT rem // jump to rem if there are < 32 bytes left
+cmp32:
+ LXVB16X (R0)(R8),V2 // 32-63 bytes left
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // match found at R8
+
+ LXVB16X (R11)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // match found at R8+16
+
+ ADD $32,R8
+ ADD $-32,R4
+rem:
+ RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
+ BR small_string
+
+foundat3:
+ ADD $16,R8
+foundat2:
+ ADD $16,R8
+foundat1:
+ ADD $16,R8
+foundat0:
+ // Compress the result into a single doubleword and
+ // move it to a GPR for the final calculation.
+ VBPERMQ V6,V10,V6
+ MFVRD V6,R3
+	// The 16 match bits end up in the low 16 bits of R3 in both endian modes.
+	// Count leading zeros up to the match and subtract 16 to get the byte index.
+ CNTLZW R3,R11
+ ADD $-16,R11
+ ADD R8,R11,R3 // Calculate byte address
+ SUB R17,R3
+ RET
+power8:
+ // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+ // in V0, V1 and V10, then branch to the preloop.
+ ANDCC $63,R3,R11
+ BEQ CR0,qw_align
+ RLDICL $0,R3,$61,R11
+
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation
+ BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+ ADD R4,R11,R4
+
+ // Check for quadword alignment
+ ANDCC $15,R8,R11
+ BEQ CR0,qw_align
+
+ // Not aligned, so handle the next doubleword
+ MOVD 0(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR7
+ BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+
+	// Either quadword or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+ // Set up auxiliary data for the vectorized algorithm.
+ VSPLTISB $0,V0 // Replicate 0 across V0
+ VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+ MTVRD R5,V1
+ LVSL (R0+R0),V11
+ VSLB V11,V10,V10
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+ CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
+ BLE tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+
+ // Not 64-byte aligned. Load one quadword at a time until aligned.
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $-16,R4,R4
+ ADD $16,R8,R8
+
+ // 64-byte aligned. Prepare for the main loop.
+preloop:
+ CMPU R4,$64
+ BLE tail // If len ≤ 64, don't use the vectorized loop
+
+ // We are now aligned to a 64-byte boundary. We will load 4 quadwords
+ // per loop iteration. The last doubleword is in R10, so our loop counter
+ // starts at (R10-R8)/64.
+ SUB R8,R10,R6
+ SRD $6,R6,R9 // Loop counter in R9
+ MOVD R9,CTR
+
+ ADD $-64,R8,R8 // Adjust index for loop entry
+ MOVD $16,R11 // Load offsets for the vector loads
+ MOVD $32,R9
+ MOVD $48,R7
+
+	// Main loop: load 64 bytes per iteration.
+loop:
+ ADD $64,R8,R8 // Fuse addi+lvx for performance
+ LVX (R8+R0),V2 // Load 4 16-byte vectors
+ LVX (R8+R11),V3
+ VCMPEQUB V1,V2,V6 // Look for byte in each vector
+ VCMPEQUB V1,V3,V7
+
+ LVX (R8+R9),V4
+ LVX (R8+R7),V5
+ VCMPEQUB V1,V4,V8
+ VCMPEQUB V1,V5,V9
+
+ VOR V6,V7,V11 // Compress the result in a single vector
+ VOR V8,V9,V12
+ VOR V11,V12,V13
+ VCMPEQUBCC V0,V13,V14 // Check for byte
+ BGE CR6,found
+ BC 16,0,loop // bdnz loop
+
+	// Handle the trailing bytes, or R4 ≤ 64
+ RLDICL $0,R6,$58,R4
+ ADD $64,R8,R8
+tail:
+ CMPU R4,$0
+ BEQ notfound
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+
+notfound:
+ MOVD $-1, R3
+ RET
+
+found:
+ // We will now compress the results into a single doubleword,
+ // so it can be moved to a GPR for the final index calculation.
+
+ // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+ // first bit of each byte into bits 48-63.
+ VBPERMQ V6,V10,V6
+ VBPERMQ V7,V10,V7
+ VBPERMQ V8,V10,V8
+ VBPERMQ V9,V10,V9
+
+ // Shift each 16-bit component into its correct position for
+ // merging into a single doubleword.
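+	// Each VBPERMQ result above sits in bits 48-63 of its vector register;
+	// these rotations move the four 16-bit groups into distinct halfword
+	// slots so the VOR/MFVRD sequence below yields a single 64-bit mask
+	// whose bit order matches memory order for the current endianness.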
+#ifdef GOARCH_ppc64le
+ VSLDOI $2,V7,V7,V7
+ VSLDOI $4,V8,V8,V8
+ VSLDOI $6,V9,V9,V9
+#else
+ VSLDOI $6,V6,V6,V6
+ VSLDOI $4,V7,V7,V7
+ VSLDOI $2,V8,V8,V8
+#endif
+
+ // Merge V6-V9 into a single doubleword and move to a GPR.
+ VOR V6,V7,V11
+ VOR V8,V9,V4
+ VOR V4,V11,V4
+ MFVRD V4,R3
+
+#ifdef GOARCH_ppc64le
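+	// (R3-1) &^ R3 keeps exactly the bits below the lowest set bit of R3,
+	// so its population count equals the trailing-zero count, i.e. the
+	// offset of the first matching byte within the 64-byte block.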
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ ADD R8,R11,R3 // Calculate byte address
+
+return:
+ SUB R17, R3
+ RET
+
+found_qw_align:
+ // Use the same algorithm as above. Compress the result into
+ // a single doubleword and move it to a GPR for the final
+ // calculation.
+ VBPERMQ V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+ MFVRD V6,R3
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11
+#else
+ VSLDOI $6,V6,V6,V6
+ MFVRD V6,R3
+ CNTLZD R3,R11
+#endif
+ ADD R8,R11,R3
+ CMPU R11,R4
+ BLT return
+ BR notfound
+ PCALIGN $16
+
+done:
+ ADD $-1,R10,R6
+ // Offset of last index for the final
+ // doubleword comparison
+ RLDICL $0,R6,$61,R6
+ // At this point, R3 has 0xFF in the same position as the byte we are
+ // looking for in the doubleword. Use that to calculate the exact index
+ // of the byte.
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ CMPU R8,R7 // Check if we are at the last doubleword.
+ SRD $3,R11 // Convert trailing zeros to bytes.
+ ADD R11,R8,R3
+ CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
+ BNE return
+ BLE CR7,return
+ BR notfound
+
+small_string:
+	// Process a string of fewer than 32 bytes.
+	// The loop is unrolled for better performance.
+ CMPU R4,$0 // Check for length=0
+ BEQ notfound
+
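+	// CMPB writes 0xFF into each byte of R3 where the corresponding bytes of
+	// R12 and R5 are equal, so a non-zero R3 means this doubleword contains
+	// the target byte.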
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base.
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7.
+ CMPU R8,R7
+ BNE CR7,done
+ BEQ notfound // Hit length.
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ BNE CR6,done
+ BR notfound
+
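
For readers who do not follow PPC64 assembly, the following is a minimal, hypothetical Go sketch of the doubleword technique the small_string path above builds on: replicate the target byte into every lane of a 64-bit word, turn matching bytes into zero bytes, and locate the first zero byte with the classic SWAR test. It is illustrative only, not part of the patch; the assembly uses the CMPB instruction rather than this XOR-based detection, and the function name swarIndexByte and the driver in main are invented for the example.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// swarIndexByte returns the index of the first occurrence of c in b, or -1.
// It scans eight bytes at a time, mirroring the doubleword loop above.
func swarIndexByte(b []byte, c byte) int {
	const lo = 0x0101010101010101 // 0x01 in every byte lane
	const hi = 0x8080808080808080 // 0x80 in every byte lane
	pattern := uint64(c) * lo     // target byte replicated, like the RLDIMI steps

	i := 0
	for ; i+8 <= len(b); i += 8 {
		w := binary.LittleEndian.Uint64(b[i:]) ^ pattern // matching bytes become 0x00
		// Classic "has a zero byte" test: the high bit of every byte lane
		// that was zero gets set, and the lowest such lane is always exact.
		if m := (w - lo) &^ w & hi; m != 0 {
			return i + bits.TrailingZeros64(m)/8 // byte offset of the first match
		}
	}
	for ; i < len(b); i++ { // leftover tail, one byte at a time
		if b[i] == c {
			return i
		}
	}
	return -1
}

func main() {
	fmt.Println(swarIndexByte([]byte("hello, world"), 'w')) // prints 7
}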