// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte loads the slice arguments into the registers expected by
// indexbytebody and tail-jumps to it.
// Frame layout used here (the Go prototype is declared elsewhere):
//   b_base+0, b_len+8 (slice header), c+24 (byte sought), ret+32 (result).
TEXT	·IndexByte(SB), NOSPLIT, $0-40
	MOVQ	b_base+0(FP), SI	// SI = start of data
	MOVQ	b_len+8(FP), BX		// BX = length of data
	MOVB	c+24(FP), AL		// AL = byte sought
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	JMP	indexbytebody<>(SB)

// ·IndexByteString is the string flavour of ·IndexByte: same register
// setup, but the arguments come from a (base, len) string header.
// Frame layout used here (the Go prototype is declared elsewhere):
//   s_base+0, s_len+8 (string header), c+16 (byte sought), ret+24 (result).
TEXT	·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ	s_base+0(FP), SI	// SI = start of data
	MOVQ	s_len+8(FP), BX		// BX = length of data
	MOVB	c+16(FP), AL		// AL = byte sought
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	JMP	indexbytebody<>(SB)

// indexbytebody stores in (R8) the offset of the first byte equal to AL
// within the BX bytes starting at SI, or -1 if there is none.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written by this routine: AX, CX, DX, DI, R11, X0, X1,
// Y1-Y3 (AVX2 path only) and the flags.
TEXT	indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	// Fewer than 16 bytes cannot be handled with a plain 16-byte load.
	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI			// DI = address of the chunk being examined

	// More than 32 bytes: try the AVX2 path (falls back to sse if unsupported).
	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// Handle lengths < 16.
small:
	TESTQ	BX, BX
	JEQ	failure		// Empty input: nothing to find.

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero exactly when SI lies in the last
	// 16 bytes of a 4 KiB page, where a 16-byte load starting at SI
	// could run onto the following page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data (may include bytes past the end).
	PCMPEQB	X0, X1		// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX		// Find first set bit.
	JZ	failure		// No set bit, failure.
	CMPL	DX, BX
	JAE	failure		// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that end at SI+BX instead, so the load does not
	// extend past the end of the data. The data then sits in the top BX
	// bytes of X1, i.e. mask bits [16-BX, 16).
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1		// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX		// Push the BX data bits up to bits [16, 16+BX).
	SHRL	$16, DX		// Shift desired bits down to bottom of register.
	BSFL	DX, DX		// Find first set bit.
	JZ	failure		// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Use the 32-byte path only if the CPU feature flag is set;
	// otherwise go back to the SSE loop (DI and X0 are already set up).
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	MOVD	AX, X0			// Low byte of X0 = byte sought (source for the broadcast).
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// Y1 = byte sought in every lane
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3			// ZF clear if any byte matched.
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses, so compare them as unsigned values,
	// matching the JB used by the SSE loop above.
	JB	avx2_loop

	// Search the last 32-byte chunk; like the SSE tail, it may overlap
	// with chunks that were already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER			// Clear upper YMM state before returning.
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-byte comparison result.
avx2success:
	VPMOVMSKB	Y3, DX		// One mask bit per byte of the chunk.
	BSFL	DX, DX			// DX = index of first match within chunk.
	SUBQ	SI, DI			// DI = offset of chunk within data.
	ADDQ	DI, DX			// DX = offset of match within data.
	MOVQ	DX, (R8)
	VZEROUPPER			// Clear upper YMM state before returning.
	RET