// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build ppc64 ppc64le #include "go_asm.h" #include "textflag.h" TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40 MOVD b_base+0(FP), R3 // R3 = byte array pointer MOVD b_len+8(FP), R4 // R4 = length MOVBZ c+24(FP), R5 // R5 = byte MOVD $ret+32(FP), R14 // R14 = &ret BR indexbytebody<>(SB) TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32 MOVD s_base+0(FP), R3 // R3 = string MOVD s_len+8(FP), R4 // R4 = length MOVBZ c+16(FP), R5 // R5 = byte MOVD $ret+24(FP), R14 // R14 = &ret BR indexbytebody<>(SB) TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R3,R17 // Save base address for calculating the index later. RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. RLDIMI $8,R5,$48,R5 // Replicating the byte across the register. ADD R4,R3,R7 // Last acceptable address in R7. DCBT (R8) // Prepare cache line. RLDIMI $16,R5,$32,R5 CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently. MOVD $-1,R9 WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). RLDIMI $32,R5,$0,R5 MOVD R7,R10 // Save last acceptable address in R10 for later. ADD $-1,R7,R7 #ifdef GOARCH_ppc64le SLD R6,R9,R9 // Prepare mask for Little Endian #else SRD R6,R9,R9 // Same for Big Endian #endif BLE small_string // Jump to the small string case if it's ≤32 bytes. // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values // in V0, V1 and V10, then branch to the preloop. ANDCC $63,R3,R11 BEQ CR0,qw_align RLDICL $0,R3,$61,R11 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7 CMPU R3,$0,CR7 // If we have a match, jump to the final computation BNE CR7,done ADD $8,R8,R8 ADD $-8,R4,R4 ADD R4,R11,R4 // Check for quadword alignment ANDCC $15,R8,R11 BEQ CR0,qw_align // Not aligned, so handle the next doubleword MOVD 0(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR7 BNE CR7,done ADD $8,R8,R8 ADD $-8,R4,R4 // Either quadword aligned or 64-byte at this point. We can use LVX. qw_align: // Set up auxiliary data for the vectorized algorithm. VSPLTISB $0,V0 // Replicate 0 across V0 VSPLTISB $3,V10 // Use V10 as control for VBPERMQ MTVRD R5,V1 LVSL (R0+R0),V11 VSLB V11,V10,V10 VSPLTB $7,V1,V1 // Replicate byte across V1 CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop BLE tail // We will load 4 quardwords per iteration in the loop, so check for // 64-byte alignment. If 64-byte aligned, then branch to the preloop. ANDCC $63,R8,R11 BEQ CR0,preloop // Not 64-byte aligned. Load one quadword at a time until aligned. LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 // Check for byte in V4 BNE CR6,found_qw_align ADD $16,R8,R8 ADD $-16,R4,R4 ANDCC $63,R8,R11 BEQ CR0,preloop LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 // Check for byte in V4 BNE CR6,found_qw_align ADD $16,R8,R8 ADD $-16,R4,R4 ANDCC $63,R8,R11 BEQ CR0,preloop LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 // Check for byte in V4 BNE CR6,found_qw_align ADD $-16,R4,R4 ADD $16,R8,R8 // 64-byte aligned. Prepare for the main loop. preloop: CMPU R4,$64 BLE tail // If len ≤ 64, don't use the vectorized loop // We are now aligned to a 64-byte boundary. We will load 4 quadwords // per loop iteration. The last doubleword is in R10, so our loop counter // starts at (R10-R8)/64. SUB R8,R10,R6 SRD $6,R6,R9 // Loop counter in R9 MOVD R9,CTR ADD $-64,R8,R8 // Adjust index for loop entry MOVD $16,R11 // Load offsets for the vector loads MOVD $32,R9 MOVD $48,R7 // Main loop we will load 64 bytes per iteration loop: ADD $64,R8,R8 // Fuse addi+lvx for performance LVX (R8+R0),V2 // Load 4 16-byte vectors LVX (R8+R11),V3 VCMPEQUB V1,V2,V6 // Look for byte in each vector VCMPEQUB V1,V3,V7 LVX (R8+R9),V4 LVX (R8+R7),V5 VCMPEQUB V1,V4,V8 VCMPEQUB V1,V5,V9 VOR V6,V7,V11 // Compress the result in a single vector VOR V8,V9,V12 VOR V11,V12,V13 VCMPEQUBCC V0,V13,V14 // Check for byte BGE CR6,found BC 16,0,loop // bdnz loop // Handle the tailing bytes or R4 ≤ 64 RLDICL $0,R6,$58,R4 ADD $64,R8,R8 tail: CMPU R4,$0 BEQ notfound LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 BNE CR6,found_qw_align ADD $16,R8,R8 CMPU R4,$16,CR6 BLE CR6,notfound ADD $-16,R4,R4 LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 BNE CR6,found_qw_align ADD $16,R8,R8 CMPU R4,$16,CR6 BLE CR6,notfound ADD $-16,R4,R4 LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 BNE CR6,found_qw_align ADD $16,R8,R8 CMPU R4,$16,CR6 BLE CR6,notfound ADD $-16,R4,R4 LVX (R8+R0),V4 VCMPEQUBCC V1,V4,V6 BNE CR6,found_qw_align notfound: MOVD $-1,R3 MOVD R3,(R14) RET found: // We will now compress the results into a single doubleword, // so it can be moved to a GPR for the final index calculation. // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the // first bit of each byte into bits 48-63. VBPERMQ V6,V10,V6 VBPERMQ V7,V10,V7 VBPERMQ V8,V10,V8 VBPERMQ V9,V10,V9 // Shift each 16-bit component into its correct position for // merging into a single doubleword. #ifdef GOARCH_ppc64le VSLDOI $2,V7,V7,V7 VSLDOI $4,V8,V8,V8 VSLDOI $6,V9,V9,V9 #else VSLDOI $6,V6,V6,V6 VSLDOI $4,V7,V7,V7 VSLDOI $2,V8,V8,V8 #endif // Merge V6-V9 into a single doubleword and move to a GPR. VOR V6,V7,V11 VOR V8,V9,V4 VOR V4,V11,V4 MFVRD V4,R3 #ifdef GOARCH_ppc64le ADD $-1,R3,R11 ANDN R3,R11,R11 POPCNTD R11,R11 // Count trailing zeros (Little Endian). #else CNTLZD R3,R11 // Count leading zeros (Big Endian). #endif ADD R8,R11,R3 // Calculate byte address return: SUB R17,R3 MOVD R3,(R14) RET found_qw_align: // Use the same algorithm as above. Compress the result into // a single doubleword and move it to a GPR for the final // calculation. VBPERMQ V6,V10,V6 #ifdef GOARCH_ppc64le MFVRD V6,R3 ADD $-1,R3,R11 ANDN R3,R11,R11 POPCNTD R11,R11 #else VSLDOI $6,V6,V6,V6 MFVRD V6,R3 CNTLZD R3,R11 #endif ADD R8,R11,R3 CMPU R11,R4 BLT return BR notfound done: // At this point, R3 has 0xFF in the same position as the byte we are // looking for in the doubleword. Use that to calculate the exact index // of the byte. #ifdef GOARCH_ppc64le ADD $-1,R3,R11 ANDN R3,R11,R11 POPCNTD R11,R11 // Count trailing zeros (Little Endian). #else CNTLZD R3,R11 // Count leading zeros (Big Endian). #endif CMPU R8,R7 // Check if we are at the last doubleword. SRD $3,R11 // Convert trailing zeros to bytes. ADD R11,R8,R3 CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset. BNE return BLE CR7,return BR notfound small_string: // We unroll this loop for better performance. CMPU R4,$0 // Check for length=0 BEQ notfound MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base. CMPU R3,$0,CR7 // If we have a match, jump to the final computation. RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7. CMPU R8,R7 BNE CR7,done BEQ notfound // Hit length. MOVDU 8(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR6 CMPU R8,R7 BNE CR6,done BEQ notfound MOVDU 8(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR6 CMPU R8,R7 BNE CR6,done BEQ notfound MOVDU 8(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR6 CMPU R8,R7 BNE CR6,done BEQ notfound MOVDU 8(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR6 BNE CR6,done BR notfound