From 73df946d56c74384511a194dd01dbe099584fd1a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 28 Apr 2024 15:14:23 +0200 Subject: Adding upstream version 1.16.10. Signed-off-by: Daniel Baumann --- src/internal/bytealg/index_arm64.s | 206 +++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 src/internal/bytealg/index_arm64.s (limited to 'src/internal/bytealg/index_arm64.s') diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s new file mode 100644 index 0000000..3a551a7 --- /dev/null +++ b/src/internal/bytealg/index_arm64.s @@ -0,0 +1,206 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 // loads the arguments into R0-R3 and R9, then jumps to indexbody + MOVD a_base+0(FP), R0 // R0 = base pointer of a (the haystack) + MOVD a_len+8(FP), R1 // R1 = length of a + MOVD b_base+24(FP), R2 // R2 = base pointer of b (the needle) + MOVD b_len+32(FP), R3 // R3 = length of b + MOVD $ret+48(FP), R9 // R9 = address of the result slot, not its value + B indexbody<>(SB) // indexbody stores the result through R9 and returns + +TEXT ·IndexString(SB),NOSPLIT,$0-40 // same setup as Index with different argument offsets + MOVD a_base+0(FP), R0 // R0 = base pointer of a (the haystack) + MOVD a_len+8(FP), R1 // R1 = length of a + MOVD b_base+16(FP), R2 // R2 = base pointer of b (the needle) + MOVD b_len+24(FP), R3 // R3 = length of b + MOVD $ret+32(FP), R9 // R9 = address of the result slot, not its value + B indexbody<>(SB) // indexbody stores the result through R9 and returns + +// input: +// R0: haystack +// R1: length of haystack +// R2: needle +// R3: length of needle (2 <= len <= 32) +// R9: address to put result (index of the first match, or -1 if there is none) +TEXT indexbody<>(SB),NOSPLIT,$0-56 + // main idea is to load 'sep' (the needle) into separate register(s) + // to avoid reloading it again and again + // for subsequent substring comparisons + SUB R3, R1, R4 // R4 = len(haystack) - len(sep) + // R4 contains the start of last substring for comparison + ADD R0, R4, R4 + ADD $1, R0, R8 // R8 = haystack + 1; every search load post-increments R0 by 1, so R0 - R8 at 'found' is the match index + + CMP $8, R3 + BHI greater_8 // len(sep) > 8 + TBZ $3, R3, len_2_7 // bit 3 of len(sep) is clear, so len(sep) < 8 +len_8: + // R5 contains the 8 bytes of sep + MOVD (R2), R5 +loop_8: + // R6 contains substring for comparison + CMP R4, R0 + BHI not_found // R0 is past the last possible start position + MOVD.P 1(R0), R6 // load 8 bytes at R0, then advance R0 by 1 + CMP R5, R6 + BNE loop_8 + B found +len_2_7: + TBZ $2, R3, len_2_3 // bit 2 clear: len(sep) is 2 or 3 + TBZ $1, R3, len_4_5 // bit 1 clear: len(sep) is 4 or 5 + TBZ $0, R3, len_6 // bit 0 clear: len(sep) is 6, otherwise 7 +len_7: + // R5 and R6 contain the 7 bytes of sep + MOVWU 
(R2), R5 + // 1-byte overlap with R5 + MOVWU 3(R2), R6 +loop_7: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 // R3 is reused as scratch; the length is not read again on this path + CMP R5, R3 + BNE loop_7 + MOVWU 2(R0), R3 // R0 was already advanced by 1, so this reads candidate bytes 3..6 + CMP R6, R3 + BNE loop_7 + B found +len_6: + // R5 and R6 contain the 6 bytes of sep + MOVWU (R2), R5 + MOVHU 4(R2), R6 +loop_6: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE loop_6 + MOVHU 3(R0), R3 // candidate bytes 4..5 (R0 already advanced by 1) + CMP R6, R3 + BNE loop_6 + B found +len_4_5: + TBZ $0, R3, len_4 // bit 0 clear: len(sep) is 4, otherwise 5 +len_5: + // R5 and R7 contain the 5 bytes of sep + MOVWU (R2), R5 + MOVBU 4(R2), R7 +loop_5: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE loop_5 + MOVBU 3(R0), R3 // candidate byte 4 (R0 already advanced by 1) + CMP R7, R3 + BNE loop_5 + B found +len_4: + // R5 contains the 4 bytes of sep + MOVWU (R2), R5 +loop_4: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R6 + CMP R5, R6 + BNE loop_4 + B found +len_2_3: + TBZ $0, R3, len_2 // bit 0 clear: len(sep) is 2, otherwise 3 +len_3: + // R6 and R7 contain the 3 bytes of sep + MOVHU (R2), R6 + MOVBU 2(R2), R7 +loop_3: + CMP R4, R0 + BHI not_found + MOVHU.P 1(R0), R3 + CMP R6, R3 + BNE loop_3 + MOVBU 1(R0), R3 // candidate byte 2 (R0 already advanced by 1) + CMP R7, R3 + BNE loop_3 + B found +len_2: + // R5 contains the 2 bytes of sep + MOVHU (R2), R5 +loop_2: + CMP R4, R0 + BHI not_found + MOVHU.P 1(R0), R6 + CMP R5, R6 + BNE loop_2 // on a match, fall through to found +found: + SUB R8, R0, R0 // R0 = R0 - R8, the index of the match in the haystack + MOVD R0, (R9) + RET +not_found: + MOVD $-1, R0 // no match: the result is -1 + MOVD R0, (R9) + RET +greater_8: + SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes (R0 is already advanced by 1 when it is used) + CMP $16, R3 + BHI greater_16 +len_9_16: + MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep; R2 is advanced by 8 + SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes + MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep +loop_9_16: + // search the first 8 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R7 // R7 is reused as scratch from here on + CMP R5, R7 + BNE loop_9_16 + MOVD (R0)(R11), R7 + CMP R6, R7 // compare the last 8 bytes + BNE loop_9_16 + B found +greater_16: + CMP $24, R3 + BHI len_25_32 +len_17_24: + LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep + SUB $24, R3, R10 // len(sep) - 24 + MOVD (R2)(R10), R7 // R7 
contains the last 8-byte of sep +loop_17_24: + // search the first 16 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R10 // R10 is reused as scratch from here on + CMP R5, R10 + BNE loop_17_24 + MOVD 7(R0), R10 // candidate bytes 8..15 (R0 already advanced by 1) + CMP R6, R10 + BNE loop_17_24 + MOVD (R0)(R11), R10 + CMP R7, R10 // compare the last 8 bytes + BNE loop_17_24 + B found +len_25_32: + LDP.P 16(R2), (R5, R6) + MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep + SUB $32, R3, R12 // len(sep) - 32 + MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep +loop_25_32: + // search the first 24 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R12 // R12 is reused as scratch from here on + CMP R5, R12 + BNE loop_25_32 + MOVD 7(R0), R12 // candidate bytes 8..15 (R0 already advanced by 1) + CMP R6, R12 + BNE loop_25_32 + MOVD 15(R0), R12 // candidate bytes 16..23 + CMP R7, R12 + BNE loop_25_32 + MOVD (R0)(R11), R12 + CMP R10, R12 // compare the last 8 bytes + BNE loop_25_32 + B found -- cgit v1.2.3