summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/index_ppc64x.s
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:19:13 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:19:13 +0000
commitccd992355df7192993c666236047820244914598 (patch)
treef00fea65147227b7743083c6148396f74cd66935 /src/internal/bytealg/index_ppc64x.s
parentInitial commit. (diff)
downloadgolang-1.21-ccd992355df7192993c666236047820244914598.tar.xz
golang-1.21-ccd992355df7192993c666236047820244914598.zip
Adding upstream version 1.21.8.upstream/1.21.8
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/internal/bytealg/index_ppc64x.s')
-rw-r--r--src/internal/bytealg/index_ppc64x.s841
1 files changed, 841 insertions, 0 deletions
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
new file mode 100644
index 0000000..80a1f85
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -0,0 +1,841 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an implementation based on the s390x
+// implementation.
+
+// Find a separator with 2 <= len <= 32 within a string.
+// Separators with lengths of 2, 3 or 4 are handled
+// specially.
+
+// This works on power8 and above. The loads and
+// compares are done in big endian order
+// since that allows the used of VCLZD, and allows
+// the same implementation to work on big and little
+// endian platforms with minimal conditional changes.
+
+// NOTE: There is a power9 implementation that
+// improves performance by 10-15% on little
+// endian for some of the benchmarks.
+// Unrolled index2to16 loop by 4 on ppc64le/power9
+// Work is still needed for a big endian
+// implementation on power9.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Needed to swap LXVD2X loads to the correct
+// byte order to work on POWER8.
+
+#ifdef GOARCH_ppc64
+DATA byteswap<>+0(SB)/8, $0x0001020304050607
+DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
+#else
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+#endif
+
+// Load bytes in big endian order. Address
+// alignment does not need checking.
+#define VLOADSWAP(base, index, vreg, vsreg) \
+ LXVD2X (base)(index), vsreg; \
+ VPERM vreg, vreg, SWAP, vreg
+
+GLOBL byteswap<>+0(SB), RODATA, $16
+
+TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+ // R3 = byte array pointer
+ // R4 = length
+ MOVD R6, R5 // R5 = separator pointer
+ MOVD R7, R6 // R6 = separator length
+
+#ifdef GOARCH_ppc64le
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+ CMP R7, $1
+ BNE power8
+ BR indexbodyp9<>(SB)
+#endif
+power8:
+ BR indexbody<>(SB)
+
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = string
+ // R4 = length
+ // R5 = separator pointer
+ // R6 = separator length
+
+#ifdef GOARCH_ppc64le
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+ CMP R7, $1
+ BNE power8
+ BR indexbodyp9<>(SB)
+
+#endif
+power8:
+ BR indexbody<>(SB)
+
+ // s: string we are searching
+ // sep: string to search for
+ // R3=&s[0], R4=len(s)
+ // R5=&sep[0], R6=len(sep)
+ // R14=&ret (index where sep found)
+ // R7=working addr of string
+ // R16=index value 16
+ // R17=index value 17
+ // R18=index value 18
+ // R19=index value 1
+ // R26=LASTBYTE of string
+ // R27=LASTSTR last start byte to compare with sep
+ // R8, R9 scratch
+ // V0=sep left justified zero fill
+ // CR4=sep length >= 16
+
+#define SEPMASK V17
+#define LASTBYTE R26
+#define LASTSTR R27
+#define ONES V20
+#define SWAP V21
+#define SWAP_ VS53
+TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
+ CMP R6, R4 // Compare lengths
+ BGT notfound // If sep len is > string, notfound
+ ADD R4, R3, LASTBYTE // find last byte addr
+ SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+ CMP R6, $0 // Check sep len
+ BEQ notfound // sep len 0 -- not found
+ MOVD R3, R7 // Copy of string addr
+ MOVD $16, R16 // Index value 16
+ MOVD $17, R17 // Index value 17
+ MOVD $18, R18 // Index value 18
+ MOVD $1, R19 // Index value 1
+ MOVD $byteswap<>+00(SB), R8
+ VSPLTISB $0xFF, ONES // splat all 1s
+ LXVD2X (R8)(R0), SWAP_ // Set up swap string
+
+ CMP R6, $16, CR4 // CR4 for len(sep) >= 16
+ VOR ONES, ONES, SEPMASK // Set up full SEPMASK
+ BGE CR4, loadge16 // Load for len(sep) >= 16
+ SUB R6, R16, R9 // 16-len of sep
+ SLD $3, R9 // Set up for VSLO
+ MTVSRD R9, V9 // Set up for VSLO
+ VSLDOI $8, V9, V9, V9 // Set up for VSLO
+ VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
+
+loadge16:
+ ANDCC $15, R5, R9 // Find byte offset of sep
+ ADD R9, R6, R10 // Add sep len
+ CMP R10, $16 // Check if sep len+offset > 16
+ BGT sepcross16 // Sep crosses 16 byte boundary
+
+ RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+ VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
+ SLD $3, R9 // Set up shift count for VSLO
+ MTVSRD R9, V8 // Set up shift count for VSLO
+ VSLDOI $8, V8, V8, V8
+ VSLO V0, V8, V0 // Shift by start byte
+
+ VAND V0, SEPMASK, V0 // Mask separator (< 16)
+ BR index2plus
+
+sepcross16:
+ VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0
+
+ VAND V0, SEPMASK, V0 // mask out separator
+ BLE CR4, index2to16
+ BR index17plus // Handle sep > 16
+
+index2plus:
+ CMP R6, $2 // Check length of sep
+ BNE index3plus // If not 2, check for 3
+ ADD $16, R7, R9 // Check if next 16 bytes past last
+ CMP R9, LASTBYTE // compare with last
+ BGE index2to16 // 2 <= len(string) <= 16
+ MOVD $0xff00, R21 // Mask for later
+ MTVSRD R21, V25 // Move to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
+ VSPLTISB $0, V10 // Clear V10
+
+ // First case: 2 byte separator
+ // V1: 2 byte separator splatted
+ // V2: 16 bytes at addr
+ // V4: 16 bytes at addr+1
+ // Compare 2 byte separator at start
+ // and at start+1. Use VSEL to combine
+ // those results to find the first
+ // matching start byte, returning
+ // that value when found. Loop as
+ // long as len(string) > 16
+index2loop2:
+ VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
+
+index2loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+ VCMPEQUH V1, V2, V5 // Search for sep
+ VCMPEQUH V1, V3, V6 // Search for sep offset by 1
+ VSEL V6, V5, V31, V7 // merge even and odd indices
+ VCLZD V7, V18 // find index of first match
+ MFVSRD V18, R25 // get first value
+ CMP R25, $64 // Found if < 64
+ BLT foundR25 // Return byte index where found
+ VSLDOI $8, V18, V18, V18 // Adjust 2nd value
+ MFVSRD V18, R25 // get second value
+ CMP R25, $64 // Found if < 64
+ ADD $64, R25 // Update byte offset
+ BLT foundR25 // Return value
+ ADD $16, R7 // R7+=16 Update string pointer
+ ADD $17, R7, R9 // R9=F7+17 since loop unrolled
+ CMP R9, LASTBYTE // Compare addr+17 against last byte
+ BLT index2loop2 // If < last, continue loop
+ CMP R7, LASTBYTE // Compare addr+16 against last byte
+ BLT index2to16 // If < 16 handle specially
+ VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
+ VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
+ BR index2loop
+
+index3plus:
+ CMP R6, $3 // Check if sep == 3
+ BNE index4plus // If not check larger
+ ADD $19, R7, R9 // Find bytes for use in this loop
+ CMP R9, LASTBYTE // Compare against last byte
+ BGE index2to16 // Remaining string 2<=len<=16
+ MOVD $0xff00, R21 // Set up mask for upcoming loop
+ MTVSRD R21, V25 // Move mask to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
+ VSPLTB $2, V0, V8 // Splat 3rd byte of sep
+
+ // Loop to process 3 byte separator.
+ // string[0:16] is in V2
+ // string[2:18] is in V3
+ // sep[0:2] splatted in V1
+ // sec[3] splatted in v8
+ // Load vectors at string, string+1
+ // and string+2. Compare string, string+1
+ // against first 2 bytes of separator
+ // splatted, and string+2 against 3rd
+ // byte splatted. Merge the results with
+ // VSEL to find the first byte of a match.
+
+ // Special handling for last 16 bytes if the
+ // string fits in 16 byte multiple.
+index3loop2:
+ MOVD $2, R21 // Set up index for 2
+ VSPLTISB $0, V10 // Clear V10
+ VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
+ VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load with correct order
+ VSLDOI $1, V2, V3, V4 // string[1:17]
+ VSLDOI $2, V2, V3, V9 // string[2:18]
+ VCMPEQUH V1, V2, V5 // compare hw even indices
+ VCMPEQUH V1, V4, V6 // compare hw odd indices
+ VCMPEQUB V8, V9, V10 // compare 3rd to last byte
+ VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
+ VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
+ VCLZD V7, V18 // Find first nonzero indexes
+ MFVSRD V18, R25 // Move 1st doubleword
+ CMP R25, $64 // If < 64 found
+ BLT foundR25 // Return matching index
+ VSLDOI $8, V18, V18, V18 // Move value
+ MFVSRD V18, R25 // Move 2nd doubleword
+ CMP R25, $64 // If < 64 found
+ ADD $64, R25 // Update byte index
+ BLT foundR25 // Return matching index
+ ADD $16, R7 // R7+=16 string ptr
+ ADD $19, R7, R9 // Number of string bytes for loop
+ CMP R9, LASTBYTE // Compare against last byte of string
+ BLT index3loop2 // If within, continue this loop
+ CMP R7, LASTSTR // Compare against last start byte
+ BLT index2to16 // Process remainder
+ VSPLTISB $0, V3 // Special case for last 16 bytes
+ BR index3loop // Continue this loop
+
+ // Loop to process 4 byte separator
+ // string[0:16] in V2
+ // string[3:16] in V3
+ // sep[0:4] splatted in V1
+ // Set up vectors with strings at offsets
+ // 0, 1, 2, 3 and compare against the 4 byte
+ // separator also splatted. Use VSEL with the
+ // compare results to find the first byte where
+ // a separator match is found.
+index4plus:
+ CMP R6, $4 // Check if 4 byte separator
+ BNE index5plus // If not next higher
+ ADD $20, R7, R9 // Check string size to load
+ CMP R9, LASTBYTE // Verify string length
+ BGE index2to16 // If not large enough, process remaining
+ MOVD $2, R15 // Set up index
+
+ // Set up masks for use with VSEL
+ MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
+ SLD $24, R21
+ MTVSRD R21, V10
+ VSPLTW $1, V10, V29
+ VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+ MOVD $0xffff, R21
+ SLD $16, R21
+ MTVSRD R21, V10
+ VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000...
+ VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+
+next4:
+ VSPLTISB $0, V10 // Clear
+ MOVD $3, R9 // Number of bytes beyond 16
+ VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3
+ VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
+ VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
+ VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
+ VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
+ VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
+ VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
+ VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
+ VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+ VSEL V14, V13, V31, V7 // final merge
+ VCLZD V7, V18 // Find first index for each half
+ MFVSRD V18, R25 // Isolate value
+ CMP R25, $64 // If < 64, found
+ BLT foundR25 // Return found index
+ VSLDOI $8, V18, V18, V18 // Move for MFVSRD
+ MFVSRD V18, R25 // Isolate other value
+ CMP R25, $64 // If < 64, found
+ ADD $64, R25 // Update index for high doubleword
+ BLT foundR25 // Return found index
+ ADD $16, R7 // R7+=16 for next string
+ ADD $20, R7, R9 // R+20 for all bytes to load
+ CMP R9, LASTBYTE // Past end? Maybe check for extra?
+ BLT index4loop // If not, continue loop
+ CMP R7, LASTSTR // Check remainder
+ BLE index2to16 // Process remainder
+ BR notfound // Not found
+
+index5plus:
+ CMP R6, $16 // Check for sep > 16
+ BGT index17plus // Handle large sep
+
+ // Assumption is that the separator is smaller than the string at this point
+index2to16:
+ CMP R7, LASTSTR // Compare last start byte
+ BGT notfound // last takes len(sep) into account
+
+ ADD $16, R7, R9 // Check for last byte of string
+ CMP R9, LASTBYTE
+ BGT index2to16tail
+
+ // At least 16 bytes of string left
+ // Mask the number of bytes in sep
+index2to16loop:
+ VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
+
+compare:
+ VAND V1, SEPMASK, V2 // Mask out sep size
+ VCMPEQUBCC V0, V2, V3 // Compare masked string
+ BLT CR6, found // All equal
+ ADD $1, R7 // Update ptr to next byte
+ CMP R7, LASTSTR // Still less than last start byte
+ BGT notfound // Not found
+ ADD $16, R7, R9 // Verify remaining bytes
+ CMP R9, LASTBYTE // At least 16
+ BLT index2to16loop // Try again
+
+ // Less than 16 bytes remaining in string
+ // Separator >= 2
+index2to16tail:
+ ADD R3, R4, R9 // End of string
+ SUB R7, R9, R9 // Number of bytes left
+ ANDCC $15, R7, R10 // 16 byte offset
+ ADD R10, R9, R11 // offset + len
+ CMP R11, $16 // >= 16?
+ BLE short // Does not cross 16 bytes
+ VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
+ BR index2to16next // Continue on
+
+short:
+ RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
+ VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
+ SLD $3, R10 // Set up shift
+ MTVSRD R10, V8 // Set up shift
+ VSLDOI $8, V8, V8, V8
+ VSLO V1, V8, V1 // Shift by start byte
+ VSPLTISB $0, V25 // Clear for later use
+
+index2to16next:
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
+ BR index2to16next // Check the next partial string
+
+index17plus:
+ CMP R6, $32 // Check if 17 < len(sep) <= 32
+ BGT index33plus
+ SUB $16, R6, R9 // Extra > 16
+ SLD $56, R9, R10 // Shift to use in VSLO
+ MTVSRD R10, V9 // Set up for VSLO
+ VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
+ VSLO V1, V9, V1 // Shift left
+ VSPLTISB $0xff, V7 // Splat 1s
+ VSPLTISB $0, V27 // Splat 0
+
+index17to32loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+
+next17:
+ VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
+ VSLO V3, V9, V3 // Shift left
+ VCMPEQUB V0, V2, V4 // Compare first 16 bytes
+ VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
+ VAND V4, V5, V6 // Check if both equal
+ VCMPEQUBCC V6, V7, V8 // All equal?
+ BLT CR6, found // Yes
+ ADD $1, R7 // On to next byte
+ CMP R7, LASTSTR // Check if last start byte
+ BGT notfound // If too high, not found
+ BR index17to32loop // Continue
+
+notfound:
+ MOVD $-1, R3 // Return -1 if not found
+ RET
+
+index33plus:
+ MOVD $0, (R0) // Case not implemented
+ RET // Crash before return
+
+foundR25:
+ SRD $3, R25 // Convert from bits to bytes
+ ADD R25, R7 // Add to current string address
+ SUB R3, R7 // Subtract from start of string
+ MOVD R7, R3 // Return byte where found
+ RET
+
+found:
+ SUB R3, R7 // Return byte where found
+ MOVD R7, R3
+ RET
+
+TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
+ CMP R6, R4 // Compare lengths
+ BGT notfound // If sep len is > string, notfound
+ ADD R4, R3, LASTBYTE // find last byte addr
+ SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+ CMP R6, $0 // Check sep len
+ BEQ notfound // sep len 0 -- not found
+ MOVD R3, R7 // Copy of string addr
+#ifndef GOPPC64_power10
+ MOVD $16, R16 // Index value 16
+ MOVD $17, R17 // Index value 17
+ MOVD $18, R18 // Index value 18
+ VSPLTISB $0xFF, ONES // splat all 1s
+ VOR ONES, ONES, SEPMASK // Set up full SEPMASK
+#else
+ SLD $56, R6, R14 // Set up separator length for LXVLL
+#endif
+ MOVD $1, R19 // Index value 1
+ CMP R6, $16, CR4 // CR4 for len(sep) >= 16
+ BGE CR4, loadge16 // Load for len(sep) >= 16
+#ifndef GOPPC64_power10
+ SUB R6, R16, R9 // 16-len of sep
+ SLD $3, R9 // Set up for VSLO
+ MTVSRD R9, V9 // Set up for VSLO
+ VSLDOI $8, V9, V9, V9 // Set up for VSLO
+ VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
+#endif
+loadge16:
+ ANDCC $15, R5, R9 // Find byte offset of sep
+ ADD R9, R6, R10 // Add sep len
+ CMP R10, $16 // Check if sep len+offset > 16
+ BGT sepcross16 // Sep crosses 16 byte boundary
+#ifdef GOPPC64_power10
+ LXVLL R5, R14, V0 // Load separator
+#else
+ RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+ LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
+ SLD $3, R9 // Set up shift count for VSLO
+ MTVSRD R9, V8 // Set up shift count for VSLO
+ VSLDOI $8, V8, V8, V8
+ VSLO V0, V8, V0 // Shift by start byte
+ VAND V0, SEPMASK, V0 // Mask separator (< 16)
+#endif
+ BR index2plus
+sepcross16:
+#ifdef GOPPC64_power10
+ LXVLL R5, R14, V0 // Load separator
+#else
+ LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0\
+ VAND V0, SEPMASK, V0 // mask out separator
+#endif
+ BLE CR4, index2to16
+ BR index17plus // Handle sep > 16
+
+index2plus:
+ CMP R6, $2 // Check length of sep
+ BNE index3plus // If not 2, check for 3
+ ADD $16, R7, R9 // Check if next 16 bytes past last
+ CMP R9, LASTBYTE // compare with last
+ BGE index2to16 // 2 <= len(string) <= 16
+ MOVD $0xff00, R21 // Mask for later
+ MTVSRD R21, V25 // Move to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
+ VSPLTISB $0, V10 // Clear V10
+
+ // First case: 2 byte separator
+ // V1: 2 byte separator splatted
+ // V2: 16 bytes at addr
+ // V4: 16 bytes at addr+1
+ // Compare 2 byte separator at start
+ // and at start+1. Use VSEL to combine
+ // those results to find the first
+ // matching start byte, returning
+ // that value when found. Loop as
+ // long as len(string) > 16
+index2loop2:
+ LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3
+
+index2loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+ VCMPEQUH V1, V2, V5 // Search for sep
+ VCMPEQUH V1, V3, V6 // Search for sep offset by 1
+ VSEL V6, V5, V31, V7 // merge even and odd indices
+ VCLZD V7, V18 // find index of first match
+ MFVSRD V18, R25 // get first value
+ CMP R25, $64 // Found if < 64
+ BLT foundR25 // Return byte index where found
+
+ MFVSRLD V18, R25 // get second value
+ CMP R25, $64 // Found if < 64
+ ADD $64, R25 // Update byte offset
+ BLT foundR25 // Return value
+ ADD $16, R7 // R7+=16 Update string pointer
+ ADD $17, R7, R9 // R9=F7+17 since loop unrolled
+ CMP R9, LASTBYTE // Compare addr+17 against last byte
+ BLT index2loop2 // If < last, continue loop
+ CMP R7, LASTBYTE // Compare addr+16 against last byte
+ BLT index2to16 // If < 16 handle specially
+ LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3
+ VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
+ BR index2loop
+
+index3plus:
+ CMP R6, $3 // Check if sep == 3
+ BNE index4plus // If not check larger
+ ADD $19, R7, R9 // Find bytes for use in this loop
+ CMP R9, LASTBYTE // Compare against last byte
+ BGE index2to16 // Remaining string 2<=len<=16
+ MOVD $0xff00, R21 // Set up mask for upcoming loop
+ MTVSRD R21, V25 // Move mask to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
+ VSPLTB $2, V0, V8 // Splat 3rd byte of sep
+
+ // Loop to process 3 byte separator.
+ // string[0:16] is in V2
+ // string[2:18] is in V3
+ // sep[0:2] splatted in V1
+ // sec[3] splatted in v8
+ // Load vectors at string, string+1
+ // and string+2. Compare string, string+1
+ // against first 2 bytes of separator
+ // splatted, and string+2 against 3rd
+ // byte splatted. Merge the results with
+ // VSEL to find the first byte of a match.
+
+ // Special handling for last 16 bytes if the
+ // string fits in 16 byte multiple.
+index3loop2:
+ MOVD $2, R21 // Set up index for 2
+ VSPLTISB $0, V10 // Clear V10
+ LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3
+ VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7
+ VSLDOI $1, V2, V3, V4 // string[1:17]
+ VSLDOI $2, V2, V3, V9 // string[2:18]
+ VCMPEQUH V1, V2, V5 // compare hw even indices
+ VCMPEQUH V1, V4, V6 // compare hw odd indices
+ VCMPEQUB V8, V9, V10 // compare 3rd to last byte
+ VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
+ VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
+ VCLZD V7, V18 // Find first nonzero indexes
+ MFVSRD V18, R25 // Move 1st doubleword
+ CMP R25, $64 // If < 64 found
+ BLT foundR25 // Return matching index
+
+ MFVSRLD V18, R25 // Move 2nd doubleword
+ CMP R25, $64 // If < 64 found
+ ADD $64, R25 // Update byte index
+ BLT foundR25 // Return matching index
+ ADD $16, R7 // R7+=16 string ptr
+ ADD $19, R7, R9 // Number of string bytes for loop
+ CMP R9, LASTBYTE // Compare against last byte of string
+ BLT index3loop2 // If within, continue this loop
+ CMP R7, LASTSTR // Compare against last start byte
+ BLT index2to16 // Process remainder
+ VSPLTISB $0, V3 // Special case for last 16 bytes
+ BR index3loop // Continue this loop
+
+ // Loop to process 4 byte separator
+ // string[0:16] in V2
+ // string[3:16] in V3
+ // sep[0:4] splatted in V1
+ // Set up vectors with strings at offsets
+ // 0, 1, 2, 3 and compare against the 4 byte
+ // separator also splatted. Use VSEL with the
+ // compare results to find the first byte where
+ // a separator match is found.
+index4plus:
+ CMP R6, $4 // Check if 4 byte separator
+ BNE index5plus // If not next higher
+ ADD $20, R7, R9 // Check string size to load
+ CMP R9, LASTBYTE // Verify string length
+ BGE index2to16 // If not large enough, process remaining
+
+ // Set up masks for use with VSEL
+ MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
+ SLD $24, R21
+ MTVSRWS R21, V29
+
+ VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+ MOVD $0xffff, R21
+ SLD $16, R21
+ MTVSRWS R21, V31
+
+ VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+
+next4:
+ VSPLTISB $0, V10 // Clear
+ MOVD $3, R9 // Number of bytes beyond 16
+ LXVB16X (R7)(R9), V3 // Load 16 bytes @R7 into V3
+ VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
+ VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
+ VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
+ VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
+ VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
+ VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
+ VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
+ VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+ VSEL V14, V13, V31, V7 // final merge
+ VCLZD V7, V18 // Find first index for each half
+ MFVSRD V18, R25 // Isolate value
+ CMP R25, $64 // If < 64, found
+ BLT foundR25 // Return found index
+
+ MFVSRLD V18, R25 // Isolate other value
+ CMP R25, $64 // If < 64, found
+ ADD $64, R25 // Update index for high doubleword
+ BLT foundR25 // Return found index
+ ADD $16, R7 // R7+=16 for next string
+ ADD $20, R7, R9 // R+20 for all bytes to load
+ CMP R9, LASTBYTE // Past end? Maybe check for extra?
+ BLT index4loop // If not, continue loop
+ CMP R7, LASTSTR // Check remainder
+ BLE index2to16 // Process remainder
+ BR notfound // Not found
+
+index5plus:
+ CMP R6, $16 // Check for sep > 16
+ BGT index17plus // Handle large sep
+
+ // Assumption is that the separator is smaller than the string at this point
+index2to16:
+ CMP R7, LASTSTR // Compare last start byte
+ BGT notfound // last takes len(sep) into account
+
+ ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes
+ CMP R9, LASTBYTE
+ // At least 16 bytes of string left
+ // Mask the number of bytes in sep
+ VSPLTISB $0, V10 // Clear
+ BGT index2to16tail
+
+#ifdef GOPPC64_power10
+ ADD $3,R7, R17 // Base+3
+ ADD $2,R7, R8 // Base+2
+ ADD $1,R7, R10 // Base+1
+#else
+ MOVD $3, R17 // Number of bytes beyond 16
+#endif
+ PCALIGN $16
+
+index2to16loop:
+
+#ifdef GOPPC64_power10
+ LXVLL R7, R14, V8 // Load next 16 bytes of string from Base
+ LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1
+ LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2
+ LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3
+#else
+ LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
+ LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
+
+ VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes
+ VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1
+ VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2
+ VAND V1, SEPMASK, V8 // Mask out sep size 0th index
+ VAND V3, SEPMASK, V9 // Mask out sep size 1st index
+ VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
+ VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
+#endif
+ VCMPEQUBCC V0, V8, V8 // compare masked string
+ BLT CR6, found // All equal while comparing 0th index
+ VCMPEQUBCC V0, V9, V9 // compare masked string
+ BLT CR6, found2 // All equal while comparing 1st index
+ VCMPEQUBCC V0, V11, V11 // compare masked string
+ BLT CR6, found3 // All equal while comparing 2nd index
+ VCMPEQUBCC V0, V12, V12 // compare masked string
+ BLT CR6, found4 // All equal while comparing 3rd index
+
+ ADD $4, R7 // Update ptr to next 4 bytes
+#ifdef GOPPC64_power10
+ ADD $4, R17 // Update ptr to next 4 bytes
+ ADD $4, R8 // Update ptr to next 4 bytes
+ ADD $4, R10 // Update ptr to next 4 bytes
+#endif
+ CMP R7, LASTSTR // Still less than last start byte
+ BGT notfound // Not found
+ ADD $19, R7, R9 // Verify remaining bytes
+ CMP R9, LASTBYTE // length of string at least 19
+ BLE index2to16loop // Try again, else do post processing and jump to index2to16next
+ PCALIGN $32
+ // <19 bytes left, post process the remaining string
+index2to16tail:
+#ifdef GOPPC64_power10
+index2to16next_p10:
+ LXVLL R7,R14, V1 // Load 16 bytes @R7 into V1
+ VCMPEQUBCC V1, V0, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BLE index2to16next_p10 // If at end, then not found
+ BR notfound // go to remainder loop
+#else
+ ADD R3, R4, R9 // End of string
+ SUB R7, R9, R9 // Number of bytes left
+ ANDCC $15, R7, R10 // 16 byte offset
+ ADD R10, R9, R11 // offset + len
+ CMP R11, $16 // >= 16?
+ BLE short // Does not cross 16 bytes
+ LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1
+ CMP R9, $16 // Post-processing of unrolled loop
+ BLE index2to16next // continue to index2to16next if <= 16 bytes
+ SUB R16, R9, R10 // R9 should be 18 or 17 hence R10 is 1 or 2
+ LXVB16X (R7)(R10), V9
+ CMP R10, $1 // string length is 17, compare 1 more byte
+ BNE extra2 // string length is 18, compare 2 more bytes
+ VSLDOI $15, V9, V10, V25
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
+ BR index2to16next // go to remainder loop
+extra2:
+ VSLDOI $14, V9, V10, V25
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VOR V1, V1, V4 // save remaining string
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte
+ BR index2to16next // Check the remaining partial string in index2to16next
+
+short:
+ RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
+ LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1
+ SLD $3, R10 // Set up shift
+ MTVSRD R10, V8 // Set up shift
+ VSLDOI $8, V8, V8, V8
+ VSLO V1, V8, V1 // Shift by start byte
+ PCALIGN $16
+index2to16next:
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
+ BR index2to16next // Check the next partial string
+#endif // Tail processing if GOPPC64!=power10
+
+index17plus:
+ CMP R6, $32 // Check if 17 < len(sep) <= 32
+ BGT index33plus
+ SUB $16, R6, R9 // Extra > 16
+ SLD $56, R9, R10 // Shift to use in VSLO
+ MTVSRD R10, V9 // Set up for VSLO
+ LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1
+ VSLO V1, V9, V1 // Shift left
+ VSPLTISB $0xff, V7 // Splat 1s
+ VSPLTISB $0, V27 // Splat 0
+
+index17to32loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+
+next17:
+ LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3
+ VSLO V3, V9, V3 // Shift left
+ VCMPEQUB V0, V2, V4 // Compare first 16 bytes
+ VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
+ VAND V4, V5, V6 // Check if both equal
+ VCMPEQUBCC V6, V7, V8 // All equal?
+ BLT CR6, found // Yes
+ ADD $1, R7 // On to next byte
+ CMP R7, LASTSTR // Check if last start byte
+ BGT notfound // If too high, not found
+ BR index17to32loop // Continue
+
+notfound:
+ MOVD $-1, R3 // Return -1 if not found
+ RET
+
+index33plus:
+ MOVD $0, (R0) // Case not implemented
+ RET // Crash before return
+
+foundR25:
+ SRD $3, R25 // Convert from bits to bytes
+ ADD R25, R7 // Add to current string address
+ SUB R3, R7 // Subtract from start of string
+ MOVD R7, R3 // Return byte where found
+ RET
+found4:
+ ADD $1, R7 // found from unrolled loop at index 3
+found3:
+ ADD $1, R7 // found from unrolled loop at index 2
+found2:
+ ADD $1, R7 // found from unrolled loop at index 1
+found: // found at index 0
+ SUB R3, R7 // Return byte where found
+ MOVD R7, R3
+ RET