author    Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-27 21:12:04 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-27 21:12:04 +0000
commit    eac54b7c4aec25060d7bd856f7cdc290943d6aae (patch)
tree      9a6d81c9f88df4698e746d63d14ddafeddd918b8 /src/liblzma/simple/arm64.c
parent    Initial commit. (diff)
Adding upstream version 5.4.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/liblzma/simple/arm64.c')
-rw-r--r--    src/liblzma/simple/arm64.c    137
1 file changed, 137 insertions(+), 0 deletions(-)
diff --git a/src/liblzma/simple/arm64.c b/src/liblzma/simple/arm64.c
new file mode 100644
index 0000000..0fe0824
--- /dev/null
+++ b/src/liblzma/simple/arm64.c
@@ -0,0 +1,137 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file arm64.c
+/// \brief Filter for ARM64 binaries
+///
+/// This converts ARM64 relative addresses in the BL and ADRP immediates
+/// to absolute values to increase redundancy of ARM64 code.
+///
+/// Converting B or ADR instructions was also tested but it's not useful.
+/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
+/// These are typical for loops and if-statements. Encoding them to their
+/// absolute address reduces redundancy since many of the small relative
+/// jump values are repeated, but very few of the absolute addresses are.
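+///
+/// As a hypothetical example, two unrelated loops that both end with a
+/// B instruction jumping back 16 bytes share an identical instruction
+/// word as is; converting both to their (different) absolute targets
+/// would destroy that match.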
+//
+// Authors: Lasse Collin
+// Jia Tan
+// Igor Pavlov
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "simple_private.h"
+
+
+static size_t
+arm64_code(void *simple lzma_attribute((__unused__)),
+ uint32_t now_pos, bool is_encoder,
+ uint8_t *buffer, size_t size)
+{
+ size_t i;
+
+ // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
+ // with auto-vectorization that is enabled by default with -O2.
+ // Such vectorization bloat happens with -O2 when targeting ARM64 too
+ // but performance hasn't been tested.
+#ifdef __clang__
+# pragma clang loop vectorize(disable)
+#endif
+ for (i = 0; i + 4 <= size; i += 4) {
+ uint32_t pc = (uint32_t)(now_pos + i);
+ uint32_t instr = read32le(buffer + i);
+
+ if ((instr >> 26) == 0x25) {
+ // BL instruction:
+ // The full 26-bit immediate is converted.
+ // The range is +/-128 MiB.
+ //
+			// Using the full range helps quite a lot with
+			// big executables. A smaller range would reduce
+			// false positives in non-code sections of the
+			// input though, so this is a compromise that
+			// slightly favors big files. With the full range,
+			// only six bits of the 32 need to match to
+			// trigger a conversion.
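+			//
+			// Worked example with hypothetical addresses:
+			// a BL at pc = 0x1000 with imm26 = 0x800 targets
+			// 0x1000 + 4 * 0x800 = 0x3000. The encoder stores
+			// (0x800 + (0x1000 >> 2)) & 0x03FFFFFF = 0xC00,
+			// which is 0x3000 >> 2. Every BL to that target,
+			// from any pc, now stores the same 0xC00.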
+ const uint32_t src = instr;
+ instr = 0x94000000;
+
+ pc >>= 2;
+ if (!is_encoder)
+ pc = 0U - pc;
+
+ instr |= (src + pc) & 0x03FFFFFF;
+ write32le(buffer + i, instr);
+
+ } else if ((instr & 0x9F000000) == 0x90000000) {
+ // ADRP instruction:
+ // Only values in the range +/-512 MiB are converted.
+ //
+ // Using less than the full +/-4 GiB range reduces
+ // false positives on non-code sections of the input
+ // while being excellent for executables up to 512 MiB.
+			// The positive effect of ADRP conversion is smaller
+			// than that of BL, but it also doesn't hurt as much
+			// in non-code sections of the input because, with a
+			// +/-512 MiB range, nine bits of the 32 need to
+			// match to trigger a conversion (two 10-bit match
+			// choices = 9 bits).
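+			//
+			// Hypothetical example: an ADRP at pc = 0x5000
+			// whose immediate points 16 pages forward rebases
+			// to the absolute page number
+			// (0x5000 >> 12) + 16 = 0x15, so every ADRP
+			// reaching that page encodes the same value.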
+ const uint32_t src = ((instr >> 29) & 3)
+ | ((instr >> 3) & 0x001FFFFC);
+
+			// With the addition, only one branch is needed to
+			// check the +/- range. The condition is usually
+			// false when processing ARM64 code, so branch
+			// prediction handles it well.
+ //
+ //if ((src & 0x001E0000) != 0
+ // && (src & 0x001E0000) != 0x001E0000)
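+			//
+			// Adding 0x00020000 biases the signed 21-bit src
+			// so that every in-range value leaves bits 18-20
+			// clear (the carry from negative values lands in
+			// bit 21, outside the mask), letting a single test
+			// cover both ends of the +/-512 MiB window.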
+ if ((src + 0x00020000) & 0x001C0000)
+ continue;
+
+ instr &= 0x9000001F;
+
+ pc >>= 12;
+ if (!is_encoder)
+ pc = 0U - pc;
+
+ const uint32_t dest = src + pc;
+ instr |= (dest & 3) << 29;
+ instr |= (dest & 0x0003FFFC) << 3;
+ instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
+ write32le(buffer + i, instr);
+ }
+ }
+
+ return i;
+}
+
+
+static lzma_ret
+arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
+ const lzma_filter_info *filters, bool is_encoder)
+{
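+	// The trailing constants follow the pattern of the other simple
+	// filters: no filter-specific state is needed (size 0), at most
+	// 4 bytes are held back unfiltered between calls, and the
+	// instruction alignment is 4 bytes.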
+ return lzma_simple_coder_init(next, allocator, filters,
+ &arm64_code, 0, 4, 4, is_encoder);
+}
+
+
+#ifdef HAVE_ENCODER_ARM64
+extern lzma_ret
+lzma_simple_arm64_encoder_init(lzma_next_coder *next,
+ const lzma_allocator *allocator,
+ const lzma_filter_info *filters)
+{
+ return arm64_coder_init(next, allocator, filters, true);
+}
+#endif
+
+
+#ifdef HAVE_DECODER_ARM64
+extern lzma_ret
+lzma_simple_arm64_decoder_init(lzma_next_coder *next,
+ const lzma_allocator *allocator,
+ const lzma_filter_info *filters)
+{
+ return arm64_coder_init(next, allocator, filters, false);
+}
+#endif
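
As a usage sketch (not part of this patch): after building with the ARM64
filter enabled, these coders are reached through the public liblzma API by
chaining LZMA_FILTER_ARM64 in front of LZMA2. The preset and integrity
check below are illustrative choices, not requirements.

#include <lzma.h>

// Initialize *strm as an ARM64 BCJ + LZMA2 .xz encoder. NULL options
// are valid for the BCJ filters (start offset 0). The decoder side is
// symmetric: lzma_stream_decoder() reads the chain from .xz headers.
static lzma_ret
init_arm64_xz_encoder(lzma_stream *strm)
{
	lzma_options_lzma opt_lzma2;
	if (lzma_lzma_preset(&opt_lzma2, LZMA_PRESET_DEFAULT))
		return LZMA_OPTIONS_ERROR;

	const lzma_filter filters[] = {
		{ .id = LZMA_FILTER_ARM64, .options = NULL },
		{ .id = LZMA_FILTER_LZMA2, .options = &opt_lzma2 },
		{ .id = LZMA_VLI_UNKNOWN, .options = NULL },
	};

	// liblzma copies the filter chain during initialization, so the
	// locals above don't need to outlive this call.
	return lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64);
}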