summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/index_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/internal/bytealg/index_arm64.s206
1 files changed, 206 insertions, 0 deletions
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000..3a551a7
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,206 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ MOVD $ret+48(FP), R9
+ B indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+16(FP), R2
+ MOVD b_len+24(FP), R3
+ MOVD $ret+32(FP), R9
+ B indexbody<>(SB)
+
+// input:
+// R0: haystack
+// R1: length of haystack
+// R2: needle
+// R3: length of needle (2 <= len <= 32)
+// R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+ // main idea is to load 'sep' into separate register(s)
+ // to avoid repeatedly re-load it again and again
+ // for sebsequent substring comparisons
+ SUB R3, R1, R4
+ // R4 contains the start of last substring for comparison
+ ADD R0, R4, R4
+ ADD $1, R0, R8
+
+ CMP $8, R3
+ BHI greater_8
+ TBZ $3, R3, len_2_7
+len_8:
+ // R5 contains 8-byte of sep
+ MOVD (R2), R5
+loop_8:
+ // R6 contains substring for comparison
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_8
+ B found
+len_2_7:
+ TBZ $2, R3, len_2_3
+ TBZ $1, R3, len_4_5
+ TBZ $0, R3, len_6
+len_7:
+ // R5 and R6 contain 7-byte of sep
+ MOVWU (R2), R5
+ // 1-byte overlap with R5
+ MOVWU 3(R2), R6
+loop_7:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_7
+ MOVWU 2(R0), R3
+ CMP R6, R3
+ BNE loop_7
+ B found
+len_6:
+ // R5 and R6 contain 6-byte of sep
+ MOVWU (R2), R5
+ MOVHU 4(R2), R6
+loop_6:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_6
+ MOVHU 3(R0), R3
+ CMP R6, R3
+ BNE loop_6
+ B found
+len_4_5:
+ TBZ $0, R3, len_4
+len_5:
+ // R5 and R7 contain 5-byte of sep
+ MOVWU (R2), R5
+ MOVBU 4(R2), R7
+loop_5:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_5
+ MOVBU 3(R0), R3
+ CMP R7, R3
+ BNE loop_5
+ B found
+len_4:
+ // R5 contains 4-byte of sep
+ MOVWU (R2), R5
+loop_4:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_4
+ B found
+len_2_3:
+ TBZ $0, R3, len_2
+len_3:
+ // R6 and R7 contain 3-byte of sep
+ MOVHU (R2), R6
+ MOVBU 2(R2), R7
+loop_3:
+ CMP R4, R0
+ BHI not_found
+ MOVHU.P 1(R0), R3
+ CMP R6, R3
+ BNE loop_3
+ MOVBU 1(R0), R3
+ CMP R7, R3
+ BNE loop_3
+ B found
+len_2:
+ // R5 contains 2-byte of sep
+ MOVHU (R2), R5
+loop_2:
+ CMP R4, R0
+ BHI not_found
+ MOVHU.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_2
+found:
+ SUB R8, R0, R0
+ MOVD R0, (R9)
+ RET
+not_found:
+ MOVD $-1, R0
+ MOVD R0, (R9)
+ RET
+greater_8:
+ SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
+ CMP $16, R3
+ BHI greater_16
+len_9_16:
+ MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep
+ SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes
+ MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep
+loop_9_16:
+ // search the first 8 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R7
+ CMP R5, R7
+ BNE loop_9_16
+ MOVD (R0)(R11), R7
+ CMP R6, R7 // compare the last 8 bytes
+ BNE loop_9_16
+ B found
+greater_16:
+ CMP $24, R3
+ BHI len_25_32
+len_17_24:
+ LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep
+ SUB $24, R3, R10 // len(sep) - 24
+ MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep
+loop_17_24:
+ // search the first 16 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R10
+ CMP R5, R10
+ BNE loop_17_24
+ MOVD 7(R0), R10
+ CMP R6, R10
+ BNE loop_17_24
+ MOVD (R0)(R11), R10
+ CMP R7, R10 // compare the last 8 bytes
+ BNE loop_17_24
+ B found
+len_25_32:
+ LDP.P 16(R2), (R5, R6)
+ MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep
+ SUB $32, R3, R12 // len(sep) - 32
+ MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep
+loop_25_32:
+ // search the first 24 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R12
+ CMP R5, R12
+ BNE loop_25_32
+ MOVD 7(R0), R12
+ CMP R6, R12
+ BNE loop_25_32
+ MOVD 15(R0), R12
+ CMP R7, R12
+ BNE loop_25_32
+ MOVD (R0)(R11), R12
+ CMP R10, R12 // compare the last 8 bytes
+ BNE loop_25_32
+ B found