// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

TEXT ·Compare(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common (min(alen, blen) < 8)
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// -(bits left) == 64 - bits left (mod 64)
	JEQ	allsame

	// Load bytes of a into the high bytes of SI. The 8-byte load may run past
	// the end of a; that is safe as long as it cannot cross a page boundary,
	// so if the pointer's low byte is above 0xf8 we instead load the 8 bytes
	// ending at the last byte of a and shift away the bytes that precede a.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// Load bytes of b into the high bytes of DI, using the same
	// page-boundary trick as for a above.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop
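
	// How the vector loops in this file detect a difference: PCMPEQB sets each
	// byte of the destination to 0xFF where a and b agree and to 0x00 where
	// they differ, PMOVMSKB packs those byte results into a 16-bit mask in AX,
	// and XORing with $0xffff flips the mask so a nonzero AX means "some byte
	// differs". BSFQ at diff16 then picks out the lowest set bit, i.e. the
	// first differing byte. A rough scalar Go sketch of that mask logic
	// (illustrative only, not part of this file; a and b stand for the two
	// 16-byte chunks being compared):
	//
	//	mask := uint16(0)
	//	for i := 0; i < 16; i++ {
	//		if a[i] != b[i] {
	//			mask |= 1 << i // PCMPEQB + PMOVMSKB + XOR $0xffff
	//		}
	//	}
	//	if mask != 0 {
	//		i := bits.TrailingZeros16(mask) // BSFQ: first differing byte
	//		if a[i] < b[i] {
	//			// result is -1
	//		} else {
	//			// result is +1
	//		}
	//	}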
	// Compare 64 bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for the last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For a remainder of fewer than 64 bytes, jump back to the normal SSE loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop
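
// For reference, the value cmpbody stores through R9 follows the usual
// lexicographic comparison rule: ·Compare backs bytes.Compare and
// runtime·cmpstring backs string comparison. A plain Go sketch of that
// contract (illustrative only, not part of this file; the function name is
// made up for the example):
//
//	func compareRef(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		// The first differing byte in the common prefix decides the result.
//		for i := 0; i < n; i++ {
//			switch {
//			case a[i] < b[i]:
//				return -1
//			case a[i] > b[i]:
//				return +1
//			}
//		}
//		// Equal prefixes: the shorter operand sorts first (the allsame case).
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return +1
//		}
//		return 0
//	}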