From 43a123c1ae6613b3efeed291fa552ecd909d3acf Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Tue, 16 Apr 2024 21:23:18 +0200
Subject: Adding upstream version 1.20.14.

Signed-off-by: Daniel Baumann
---
 src/internal/bytealg/indexbyte_ppc64x.s | 391 ++++++++++++++++++++++++++++++++
 1 file changed, 391 insertions(+)
 create mode 100644 src/internal/bytealg/indexbyte_ppc64x.s

diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..1a6e852
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,391 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = byte array pointer
+	// R4 = length
+	MOVD	R6, R5		// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+	// R3 = string
+	// R4 = length
+	// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R17		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	ADD	R4,R3,R7	// Last acceptable address in R7.
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
+	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLT	small_string	// Jump to the small string case if it's <32 bytes.
+	CMP	R16,$1		// optimize for power8 v power9
+	BNE	power8
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMP	R4,$64
+	MOVD	$16,R11
+	MOVD	R3,R8
+	BLT	cmp32
+	MOVD	$32,R12
+	MOVD	$48,R6
+
+loop64:
+	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8, jump out
+
+	LXVB16X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+
+	LXVB16X	(R8)(R12),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+
+	LXVB16X	(R8)(R6),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	ADD	$64,R8
+	ADD	$-64,R4
+	CMP	R4,$64		// >=64 bytes left to scan?
+	BGE	loop64
+	CMP	R4,$32
+	BLT	rem		// jump to rem if there are < 32 bytes left
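+	// Between 32 and 63 bytes remain: cmp32 checks two more 16-byte
+	// vectors, then falls through to rem, which re-aligns R8 so the
+	// doubleword-at-a-time small_string code can finish the last <32
+	// bytes without needing a separate vector tail.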
+cmp32:
+	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8
+
+	LXVB16X	(R11)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16
+
+	ADD	$32,R8
+	ADD	$-32,R4
+rem:
+	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
+	BR	small_string
+
+foundat3:
+	ADD	$16,R8
+foundat2:
+	ADD	$16,R8
+foundat1:
+	ADD	$16,R8
+foundat0:
+	// Compress the result into a single doubleword and
+	// move it to a GPR for the final calculation.
+	VBPERMQ	V6,V10,V6
+	MFVRD	V6,R3
+	// Count leading zeroes up to the match, which ends up in the low 16 bits
+	// in both endian modes; compute the index by subtracting 16 from the count.
+	CNTLZW	R3,R11
+	ADD	$-16,R11
+	ADD	R8,R11,R3	// Calculate byte address
+	SUB	R17,R3
+	RET
+power8:
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
+
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword aligned or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB	$0,V0	// Replicate 0 across V0
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11
+	VSLB	V11,V10,V10
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
+	BLE	tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$-16,R4,R4
+	ADD	$16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail		// If len ≤ 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
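+	// For example, with R8 64-byte aligned and R10-R8 = 200, CTR is set
+	// to 200>>6 = 3 full 64-byte iterations; the low six bits (200&63 = 8
+	// bytes) are recovered into R4 after the loop and handled by tail.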
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9	// Loop counter in R9
+	MOVD	R9,CTR
+
+	ADD	$-64,R8,R8	// Adjust index for loop entry
+	MOVD	$16,R11		// Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop: we will load 64 bytes per iteration
+loop:
+	ADD	$64,R8,R8	// Fuse addi+lvx for performance
+	LVX	(R8+R0),V2	// Load 4 16-byte vectors
+	LVX	(R8+R11),V3
+	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
+	VCMPEQUB	V1,V3,V7
+
+	LVX	(R8+R9),V4
+	LVX	(R8+R7),V5
+	VCMPEQUB	V1,V4,V8
+	VCMPEQUB	V1,V5,V9
+
+	VOR	V6,V7,V11	// Compress the result in a single vector
+	VOR	V8,V9,V12
+	VOR	V11,V12,V13
+	VCMPEQUBCC	V0,V13,V14	// Check for byte
+	BGE	CR6,found
+	BC	16,0,loop	// bdnz loop
+
+	// Handle the trailing bytes, or R4 ≤ 64
+	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
+tail:
+	CMPU	R4,$0
+	BEQ	notfound
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+
+notfound:
+	MOVD	$-1, R3
+	RET
+
+found:
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	V6,V10,V6
+	VBPERMQ	V7,V10,V7
+	VBPERMQ	V8,V10,V8
+	VBPERMQ	V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	$2,V7,V7,V7
+	VSLDOI	$4,V8,V8,V8
+	VSLDOI	$6,V9,V9,V9
+#else
+	VSLDOI	$6,V6,V6,V6
+	VSLDOI	$4,V7,V7,V7
+	VSLDOI	$2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17, R3
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	V6,R3
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11
+#else
+	VSLDOI	$6,V6,V6,V6
+	MFVRD	V6,R3
+	CNTLZD	R3,R11
+#endif
+	ADD	R8,R11,R3
+	CMPU	R11,R4
+	BLT	return
+	BR	notfound
+	PCALIGN	$16
+
+done:
+	ADD	$-1,R10,R6
+	// Offset of last index for the final
+	// doubleword comparison
+	RLDICL	$0,R6,$61,R6
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert the zero count to a byte offset.
+	ADD	R11,R8,R3
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	BR	notfound
+
+small_string:
+	// Process a string of length < 32 bytes.
+	// We unroll this loop for better performance.
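+	// CMPB compares R12 and R5 byte by byte, writing 0xFF into each byte
+	// of R3 where the two match and 0x00 elsewhere. Since R5 holds the
+	// target byte replicated eight times, a nonzero R3 means this
+	// doubleword contains the byte, and done converts the position of
+	// that 0xFF byte into the final index.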
+	CMPU	R4,$0		// Check for length=0
+	BEQ	notfound
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+	CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done
+	BR	notfound
+
--
cgit v1.2.3
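
For reference, both entry points in the patch implement the same contract as
the minimal Go sketch below (the name indexByteRef is hypothetical; the real
generic fallback lives elsewhere in internal/bytealg): return the index of
the first occurrence of the byte, or -1 when it is absent.

	package main

	import "fmt"

	// indexByteRef is a scalar reference for the assembly above: it scans
	// the slice one byte at a time instead of 64 bytes per iteration.
	func indexByteRef(b []byte, c byte) int {
		for i, x := range b {
			if x == c {
				return i
			}
		}
		return -1
	}

	func main() {
		fmt.Println(indexByteRef([]byte("hello gopher"), 'g')) // 6
		fmt.Println(indexByteRef([]byte("hello gopher"), 'z')) // -1
	}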