// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// Compare orders two byte sequences and leaves -1, 0 or +1 in R3.
// It shuffles the incoming registers into the layout the cmpbody
// routines expect (R3 = a len, R4 = b len, R5 = a addr, R6 = b addr),
// short-circuits when both operands start at the same address, and
// otherwise branches to the POWER9 or POWER8 comparison body.
//
// NOTE(review): the code below reads its arguments from registers, but
// the symbol name here carries no ABI selector. Confirm against a
// pristine copy that a marker such as <ABIInternal> was not lost.
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
	// incoming:
	// R3 a addr -> R5
	// R4 a len -> R3
	// R5 a cap unused
	// R6 b addr -> R6
	// R7 b len -> R4
	// R8 b cap unused
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// CR7: a addr vs b addr
	CMP	R3,R4,CR6	// CR6: a len vs b len
	BEQ	CR7,equal	// same address: only the lengths decide
	// Pick the implementation from the HasPOWER9 byte of internal/cpu.PPC64.
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	BEQ	CR6,done	// same address and same length: return 0
	MOVD	$1, R8
	BGT	CR6,greater	// a len > b len: return +1
	NEG	R8		// otherwise return -1
greater:
	MOVD	R8, R3
	RET
done:
	MOVD	$0, R3
	RET

// cmpstring follows the same steps as Compare above, but for operands
// described by an address and a length only (no capacity register).
// The result (-1, 0 or +1) is left in R3.
//
// NOTE(review): as with Compare, the arguments are read from registers
// while the symbol shows no ABI selector. Confirm against a pristine copy.
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
	// incoming:
	// R3 a addr -> R5
	// R4 a len -> R3
	// R5 b addr -> R6
	// R6 b len -> R4
	MOVD	R6, R7		// park b len in R7 while R5/R6 are reassigned
	MOVD	R5, R6
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// CR7: a addr vs b addr
	CMP	R3,R4,CR6	// CR6: a len vs b len
	BEQ	CR7,equal	// same address: only the lengths decide
	// Pick the implementation from the HasPOWER9 byte of internal/cpu.PPC64.
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	BEQ	CR6,done	// same address and same length: return 0
	MOVD	$1, R8
	BGT	CR6,greater	// a len > b len: return +1
	NEG	R8		// otherwise return -1
greater:
	MOVD	R8, R3
	RET
done:
	MOVD	$0, R3
	RET

#ifdef GOARCH_ppc64le
// 16-byte read-only permute pattern; loaded into the SWAP vector register
// and applied with VPERM in cmpbody on little-endian builds.
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif

// Do an efficient memcmp for ppc64le/ppc64/POWER8
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
//
// R8 holds the number of bytes to compare (the smaller of the two lengths)
// and CR2 keeps the a-len/b-len comparison, which decides the result when
// all compared bytes match. Return values are -1, 0 and +1.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// BLT CR2: a len < b len, keep R8 = a len
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$32		// optimize >= 32
	MOVD	R8,R9		// R9 = remaining byte count
	BLT	setup8a		// optimize < 32
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$64
	BLT	cmp32		// process size 32-63

	DCBT	(R5)		// optimize >= 64
	DCBT	(R6)		// cache hint
	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different	// jump out if it's different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
	CMPU	R9,$0
	BNE	rem		// loop to rem if the remainder is not 0

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	// NOTE(review): text appears to be missing from this copy between the
	// next instruction and the LXVD2X that follows it. The labels cmp32,
	// rem and different are branched to but not defined in the visible
	// text, the #endif below has no visible #ifdef, and "+00(SB), R16"
	// looks like the tail of a lost instruction. Restore from a pristine
	// copy before assembling.
	BLT	CR2,less	// jump to less if len(A)+00(SB), R16
	LXVD2X	(R16)(R0),SWAP	// Set up swap string

	VPERM	V3,V3,SWAP,V3
	VPERM	V4,V4,SWAP,V4
#endif
	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BEQ	lower
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
lower:
	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
	MFVSRD	VS35,R16
	VSLDOI	$8,V4,V4,V4
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
setup8a:
	SRADCC	$3,R8,R9	// get the 8 byte count
	BEQ	leftover	// shifted value is 0
	CMPU	R8,$8		// optimize 8byte move
	BEQ	size8
	CMPU	R8,$16
	BEQ	size16
	MOVD	R9,CTR		// loop count for doublewords
loop8:
#ifdef GOARCH_ppc64le
	MOVDBR	(R5+R0),R16	// doublewords to compare
	MOVDBR	(R6+R0),R10	// LE compare order
#else
	MOVD	(R5+R0),R16	// doublewords to compare
	MOVD	(R6+R0),R10	// BE compare order
#endif
	ADD	$8,R5
	ADD	$8,R6
	CMPU	R16,R10		// match?
	BC	8,2,loop8	// bt ctr <> 0 && cr
	BGT	greater
	BLT	less
leftover:
	ANDCC	$7,R8,R9	// check for leftover bytes
	BEQ	zeroremainder
simplecheck:
	MOVD	R0,R14		// R14 = running byte offset into A and B
	CMP	R9,$4		// process 4 bytes
	BLT	halfword
#ifdef GOARCH_ppc64le
	MOVWBR	(R5)(R14),R10
	MOVWBR	(R6)(R14),R11
#else
	MOVWZ	(R5)(R14),R10
	MOVWZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-4,R9
	ADD	$4,R14
	PCALIGN	$16
halfword:
	CMP	R9,$2		// process 2 bytes
	BLT	byte
#ifdef GOARCH_ppc64le
	MOVHBR	(R5)(R14),R10
	MOVHBR	(R6)(R14),R11
#else
	MOVHZ	(R5)(R14),R10
	MOVHZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-2,R9
	ADD	$2,R14
	PCALIGN	$16
byte:
	CMP	R9,$0		// process 1 byte
	BEQ	skip
	MOVBZ	(R5)(R14),R10
	MOVBZ	(R6)(R14),R11
	CMPU	R10,R11
	BGT	greater
	BLT	less
	PCALIGN	$16
skip:
	// All compared bytes matched: the length comparison in CR2 decides.
	BEQ	CR2,equal
	BGT	CR2,greater
less:
	MOVD	$-1,R3		// return value if A < B
	RET
size16:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
zeroremainder:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	// NOTE(review): text appears to be missing from this copy after the
	// next instruction. The label size8 is branched to but not defined in
	// the visible text, and the comment below ends in residue ("1st len").
	// Restore from a pristine copy.
	BLT	CR2,less	// jump to less if len(A) 1st len
	BLT	CR2,less	// 2nd len < 1st len
equal:
	MOVD	$0, R3		// return value if A == B
	RET
greater:
	MOVD	$1,R3		// return value if A > B
	RET

// Do an efficient memcmp for ppc64le/ppc64/POWER9
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
//
// As in cmpbody, R8 holds the number of bytes to compare (the smaller of
// the two lengths) and CR2 keeps the a-len/b-len comparison.
TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// BLT CR2: a len < b len, keep R8 = a len
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$16		// optimize for size<16
	MOVD	R8,R9		// R9 = remaining byte count
	BLT	simplecheck
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$32		// optimize for size 16-31
	BLT	cmp16
	CMP	R8,$64
	BLT	cmp32		// optimize for size 32-63
	DCBT	(R5)		// optimize for size>=64
	DCBT	(R6)		// cache hint

	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
	VCMPNEBCC	V3,V4,V1	// record comparison into V1
	BNE	CR6,different	// jump out if it's different

	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
	CMPU	R9,$16
	BGE	cmp16		// loop to cmp16 if there are 16-31 bytes left
	CMPU	R9,$0
	BNE	simplecheck	// loop to simplecheck for remaining bytes

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	// NOTE(review): text appears to be missing from this copy after the
	// next instruction. The labels cmp32 and different are branched to but
	// not defined in the visible text of this routine, and the comment
	// below ends in residue ("B"). Restore from a pristine copy.
	BLT	CR2,less	// jump to less if len(A) B
	RET
cmp16:
	ANDCC	$16,R9,R31	// is the 16s bit of the remaining count set?
	BEQ	tail		// no: only the final partial chunk is left
	LXVB16X	(R0)(R5),V3	// load the next 16 bytes of A (index R0) into vector
	LXVB16X	(R0)(R6),V4	// load the next 16 bytes of B (index R0) into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	ADD	$16,R5
	ADD	$16,R6
tail:
	ANDCC	$15,R9		// Load the last 16 bytes (we know there are at least 32b)
	BEQ	end
	ADD	R9,R5		// advance both pointers by the leftover count
	ADD	R9,R6
	MOVD	$-16,R10
	LXVB16X	(R10)(R5),V3	// load the 16 bytes of A ending at the advanced pointer
	LXVB16X	(R10)(R6),V4	// load the 16 bytes of B ending at the advanced pointer
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
end:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	// NOTE(review): the visible text stops inside the comment on the next
	// line. The labels simplecheck, less and equal used by this routine are
	// not defined in the visible part, so it presumably continues past
	// this point. Restore from a pristine copy.
	BLT	CR2,less	// jump to less if BLT CR2 that is, len(A)